On Fri, Mar 13, 2015 at 02:15:11PM -0400, Tucker DiNapoli wrote: > This change is to allow support for different sized blocks, which will > be necessary for sse and avx. My plan is for the code to still act on > 8x8 blocks, but to process multiple 8x8 blocks in parallel when using > sse/avx. > --- > libpostproc/postprocess.c | 3 --- > libpostproc/postprocess_c.c | 36 ++++++++++++++++++------------------ > libpostproc/postprocess_internal.h | 17 ++++++++++++++++- > libpostproc/postprocess_template.c | 18 +++++++++--------- > 4 files changed, 43 insertions(+), 31 deletions(-) > > diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c > index 2cdd988..3090869 100644 > --- a/libpostproc/postprocess.c > +++ b/libpostproc/postprocess.c > @@ -115,9 +115,6 @@ const char *postproc_license(void) > > #define GET_MODE_BUFFER_SIZE 500 > #define OPTIONS_ARRAY_SIZE 10 > -#define BLOCK_SIZE 8 > -#define TEMP_STRIDE 8 > -//#define NUM_BLOCKS_AT_ONCE 16 //not used yet > > #if ARCH_X86 && HAVE_INLINE_ASM > DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL; > diff --git a/libpostproc/postprocess_c.c b/libpostproc/postprocess_c.c > index 3d3b738..5660c64 100644 > --- a/libpostproc/postprocess_c.c > +++ b/libpostproc/postprocess_c.c > @@ -32,7 +32,7 @@ static inline int isHorizDC_C(const uint8_t src[], int > stride, const PPContext * > const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; > const int dcThreshold= dcOffset*2 + 1; > > - for(y=0; y<BLOCK_SIZE; y++){ > + for(y=0; y<block_height; y++){ > numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold; > numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold; > numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold; > @@ -56,7 +56,7 @@ static inline int isVertDC_C(const uint8_t src[], int > stride, const PPContext *c > const int dcThreshold= dcOffset*2 + 1; > > src+= stride*4; // src points to begin of the 8x8 Block > - for(y=0; y<BLOCK_SIZE-1; y++){ > + for(y=0; y<block_height-1; y++){ > numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < > dcThreshold; > numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < > dcThreshold; > numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < > dcThreshold; > @@ -90,7 +90,7 @@ static inline int isVertMinMaxOk_C(const uint8_t src[], int > stride, int QP) > { > int x; > src+= stride*4; > - for(x=0; x<BLOCK_SIZE; x+=4){ > + for(x=0; x<block_width; x+=4){ > if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > > 4*QP) return 0; > if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > > 4*QP) return 0; > if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > > 4*QP) return 0; > @@ -120,7 +120,7 @@ static inline int vertClassify_C(const uint8_t src[], int > stride, const PPContex > static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const > PPContext *c) > { > int y; > - for(y=0; y<BLOCK_SIZE; y++){ > + for(y=0; y<block_height; y++){ > const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]); > > if(FFABS(middleEnergy) < 8*c->QP){ > @@ -159,7 +159,7 @@ static inline void doHorizDefFilter_C(uint8_t dst[], int > stride, const PPContext > static inline void doHorizLowPass_C(uint8_t dst[], int stride, const > PPContext *c) > { > int y; > - for(y=0; y<BLOCK_SIZE; y++){ > + for(y=0; y<block_height; y++){ > const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0]; > const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7]; > > @@ -229,7 +229,7 @@ static inline void horizX1Filter(uint8_t *src, int > stride, int QP) > } > } > > - for(y=0; y<BLOCK_SIZE; y++){ > + for(y=0; y<block_height; y++){ > int a= src[1] - src[2]; > int b= src[3] - src[4]; > int c= src[5] - src[6]; > @@ -392,7 +392,7 @@ static inline void doVertLowPass_C(uint8_t *src, int > stride, PPContext *c) > const int l9= stride + l8; > int x; > src+= stride*3; > - for(x=0; x<BLOCK_SIZE; x++){ > + for(x=0; x<block_width; x++){ > const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; > const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; > > @@ -443,7 +443,7 @@ static inline void vertX1Filter_C(uint8_t *src, int > stride, PPContext *co) > int x; > > src+= stride*3; > - for(x=0; x<BLOCK_SIZE; x++){ > + for(x=0; x<block_width; x++){ > int a= src[l3] - src[l4]; > int b= src[l4] - src[l5]; > int c= src[l5] - src[l6]; > @@ -478,7 +478,7 @@ static inline void doVertDefFilter_C(uint8_t src[], int > stride, PPContext *c) > // const int l9= stride + l8; > int x; > src+= stride*3; > - for(x=0; x<BLOCK_SIZE; x++){ > + for(x=0; x<block_width; x++){ > const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - > src[l6]); > if(FFABS(middleEnergy) < 8*c->QP){ > const int q=(src[l4] - src[l5])/2; > @@ -881,13 +881,13 @@ static inline void blockCopy_C(uint8_t dst[], int > dstStride, const uint8_t src[] > { > int i; > if(levelFix){ > - for(i=0; i<8; i++) > + for(i=0; i<block_height; i++) > memcpy( &(dst[dstStride*i]), > - &(src[srcStride*i]), BLOCK_SIZE); > + &(src[srcStride*i]), block_width); > }else{ > - for(i=0; i<8; i++) > + for(i=0; i<block_height; i++) > memcpy( &(dst[dstStride*i]), > - &(src[srcStride*i]), BLOCK_SIZE); > + &(src[srcStride*i]), block_width); > } > } > > @@ -908,7 +908,7 @@ static inline void duplicate_C(uint8_t src[], int stride) > * Filter array of bytes (Y or U or V values) > */ > static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], > int dstStride, int width, int height, > - const QP_STORE_T QPs[], int QPStride, int > isColor, PPContext *c2) > + const QP_STORE_T QPs[], int QPStride, int isColor, > PPContext *c2) > { > DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access > int x,y; > @@ -999,7 +999,7 @@ static void postProcess_C(const uint8_t src[], int > srcStride, uint8_t dst[], int > } > > /* copy & deinterlace first row of blocks */ > - y=-BLOCK_SIZE; > + y=-block_height; > { > const uint8_t *srcBlock= &(src[y*srcStride]); > uint8_t *dstBlock= tempDst + dstStride; > @@ -1007,7 +1007,7 @@ static void postProcess_C(const uint8_t src[], int > srcStride, uint8_t dst[], int > // From this point on it is guaranteed that we can read and write 16 > lines downward > // finish 1 block before the next otherwise we might have a problem > // with the L1 Cache of the P4 ... or only a few blocks at a time or > something > - for(x=0; x<width; x+=BLOCK_SIZE){ > + for(x=0; x<width; x+=block_width){ > > > blockCopy_C(dstBlock + dstStride*8, dstStride, > @@ -1043,7 +1043,7 @@ static void postProcess_C(const uint8_t src[], int > srcStride, uint8_t dst[], int > } > } > > - for(y=0; y<height; y+=BLOCK_SIZE){ > + for(y=0; y<height; y+=block_height){ > //1% speedup if these are here instead of the inner loop > const uint8_t *srcBlock= &(src[y*srcStride]); > uint8_t *dstBlock= &(dst[y*dstStride]); > @@ -1077,7 +1077,7 @@ static void postProcess_C(const uint8_t src[], int > srcStride, uint8_t dst[], int > // From this point on it is guaranteed that we can read and write 16 > lines downward > // finish 1 block before the next otherwise we might have a problem > // with the L1 Cache of the P4 ... or only a few blocks at a time or > something > - for(x=0; x<width; x+=BLOCK_SIZE){ > + for(x=0; x<width; x+=block_width){ > const int stride= dstStride; > if(isColor){ > QP= QPptr[x>>qpHShift]; > diff --git a/libpostproc/postprocess_internal.h > b/libpostproc/postprocess_internal.h > index 1ebd974..5a7be1f 100644 > --- a/libpostproc/postprocess_internal.h > +++ b/libpostproc/postprocess_internal.h > @@ -174,5 +174,20 @@ static inline void linecpy(void *dest, const void *src, > int lines, int stride) { > memcpy((uint8_t*)dest+(lines-1)*stride, (const > uint8_t*)src+(lines-1)*stride, -lines*stride); > } > } > - > +/* > + Currently blocks are always 8xN bytes, where N is determined by the size > of > + the simd registers being used > +*/ > +static const int block_height = 8; > +#if ARCH_X86 && !CONFIG_RUNTIME_CPUDETECT > +#if HAVE_AVX2 > +static const int block_width = 32; > +#elif HAVE_SSE2 > +static const int block_width = 16; > +#else > +static const int block_width = 8; > +#endif > +#else > +static int block_width; //determined at runtime > +#endif > #endif /* POSTPROC_POSTPROCESS_INTERNAL_H */
non constant globals are not allowed, there can be multiple instances of the postprocess library and each might in theory use a different set of cpu flags and optimizations also this does not work all tests fail: TEST filter-pp --- ./tests/ref/fate/filter-pp 2015-03-13 19:34:25.103761168 +0100 +++ tests/data/fate/filter-pp 2015-03-13 19:49:45.559780559 +0100 @@ -1 +1 @@ -pp 3730f1ed7bf244ce059d110e21f39bbd +pp \ No newline at end of file Test filter-pp failed. Look at tests/data/fate/filter-pp.err for details. make: *** [fate-filter-pp] Error 139 TEST filter-pp1 --- ./tests/ref/fate/filter-pp1 2015-03-13 19:34:25.103761168 +0100 +++ tests/data/fate/filter-pp1 2015-03-13 19:49:45.647780561 +0100 @@ -1 +1 @@ -pp1 cb9f884e27a5be11f72afc9b517efd10 +pp1 \ No newline at end of file Test filter-pp1 failed. Look at tests/data/fate/filter-pp1.err for details. make: *** [fate-filter-pp1] Error 139 TEST filter-pp2 --- ./tests/ref/fate/filter-pp2 2015-03-13 19:34:25.103761168 +0100 +++ tests/data/fate/filter-pp2 2015-03-13 19:49:45.731780563 +0100 @@ -1 +1 @@ -pp2 975d4c1d489e42ddbd4a27464e8355af +pp2 \ No newline at end of file Test filter-pp2 failed. Look at tests/data/fate/filter-pp2.err for details. make: *** [fate-filter-pp2] Error 139 TEST filter-pp3 --- ./tests/ref/fate/filter-pp3 2015-03-13 19:34:25.103761168 +0100 +++ tests/data/fate/filter-pp3 2015-03-13 19:49:45.811780565 +0100 @@ -1 +1 @@ -pp3 ef0f10f1859af2f75717e8c9d64ee38a +pp3 \ No newline at end of file Test filter-pp3 failed. Look at tests/data/fate/filter-pp3.err for details. make: *** [fate-filter-pp3] Error 139 TEST filter-pp4 --- ./tests/ref/fate/filter-pp4 2015-03-13 19:34:25.103761168 +0100 +++ tests/data/fate/filter-pp4 2015-03-13 19:49:45.883780566 +0100 @@ -1 +1 @@ -pp4 0a2895c619ab9c6c22fd7cffb25070a8 +pp4 \ No newline at end of file Test filter-pp4 failed. Look at tests/data/fate/filter-pp4.err for details. make: *** [fate-filter-pp4] Error 139 TEST filter-pp5 --- ./tests/ref/fate/filter-pp5 2015-03-13 19:34:25.103761168 +0100 +++ tests/data/fate/filter-pp5 2015-03-13 19:49:45.963780568 +0100 @@ -1 +1 @@ -pp5 5fc6703d42bd98942e5dd104ce220291 +pp5 \ No newline at end of file Test filter-pp5 failed. Look at tests/data/fate/filter-pp5.err for details. make: *** [fate-filter-pp5] Error 139 TEST filter-pp6 --- ./tests/ref/fate/filter-pp6 2015-03-13 19:34:25.103761168 +0100 +++ tests/data/fate/filter-pp6 2015-03-13 19:49:46.035780569 +0100 @@ -1 +1 @@ -pp6 93b508d07bfcf4703aa7dff2d2ef5c03 +pp6 \ No newline at end of file Test filter-pp6 failed. Look at tests/data/fate/filter-pp6.err for details. make: *** [fate-filter-pp6] Error 139 and i do not understand how this could work, the QP parameter is at a granularity of 16x16 in the luma plane and 8x8 in the chroma planes the existing code uses one QP or 2 adjacent ones while processing a 8x8 block. If you increase the block size there will be more QP parameters to consider [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB If a bugfix only changes things apparently unrelated to the bug with no further explanation, that is a good sign that the bugfix is wrong.
signature.asc
Description: Digital signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel