From: Maxime Taisant <maximetais...@hotmail.fr> Hi,
I am currently working on SSE optimisations for the DWT functions used to decode JPEG2000. For the moment, I have only managed to produce an SSE-optimized version of the sr_1d97_float function (with relatively good results). I would like to have some comments on my work so far, to know if I am on the right track or if there are some parts that I need to improve or modify. Thank you. --- libavcodec/jpeg2000dwt.c | 5 +- libavcodec/jpeg2000dwt.h | 2 + libavcodec/x86/jpeg2000dsp.asm | 268 ++++++++++++++++++++++++++++++++++++++ libavcodec/x86/jpeg2000dsp_init.c | 3 + 4 files changed, 277 insertions(+), 1 deletion(-) diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c index 55dd5e89b5..b2a952aa29 100644 --- a/libavcodec/jpeg2000dwt.c +++ b/libavcodec/jpeg2000dwt.c @@ -425,7 +425,10 @@ static void dwt_decode97_float(DWTContext *s, float *t) for (i = 1 - mh; i < lh; i += 2, j++) l[i] = data[w * lp + j]; - sr_1d97_float(line, mh, mh + lh); + if (ARCH_X86) + ff_sr_1d97_float_sse(line, mh, mh + lh); + else + sr_1d97_float(line, mh, mh + lh); for (i = 0; i < lh; i++) data[w * lp + i] = l[i]; diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h index 718d183ac1..59dec14478 100644 --- a/libavcodec/jpeg2000dwt.h +++ b/libavcodec/jpeg2000dwt.h @@ -65,4 +65,6 @@ int ff_dwt_decode(DWTContext *s, void *t); void ff_dwt_destroy(DWTContext *s); +void ff_sr_1d97_float_sse(float *p, int i0, int i1); + #endif /* AVCODEC_JPEG2000DWT_H */ diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm index 56b5fbd606..dabfb914b8 100644 --- a/libavcodec/x86/jpeg2000dsp.asm +++ b/libavcodec/x86/jpeg2000dsp.asm @@ -29,6 +29,16 @@ pf_ict1: times 8 dd 0.34413 pf_ict2: times 8 dd 0.71414 pf_ict3: times 8 dd 1.772 +F_LFTG_K: dd 1.230174104914001 +F_LFTG_X: dd 0.812893066115961 + +F_LFTG_ALPHA: times 8 dd 1.586134342059924 +F_LFTG_BETA: times 8 dd 0.052980118572961 +F_LFTG_GAMMA: times 8 dd 0.882911075530934 +F_LFTG_DELTA: times 8 dd 0.443506852043971 + +TWO: dd 
2.0 + SECTION .text ;*********************************************************************** @@ -142,3 +152,261 @@ RCT_INT INIT_YMM avx2 RCT_INT %endif + +;*********************************************************************** +; ff_sr_1d97_float_<opt>(float *p, int i0, int i1) +;*********************************************************************** +%macro SR1D97FLOAT 0 +cglobal sr_1d97_float, 3, 5, 10, p, i0, i1, tmp0, tmp1 + mov tmp0q, i0q + mov tmp1q, i1q + add tmp0q, 1 + cmp tmp1q, tmp0q + jg .extend + sub tmp0q, 2 + jnz .else + movss m0, [pq+4] + movss m1, [F_LFTG_K] + movss m2, [TWO] + divss m1, m2 + mulss m0, m1 + movss [pq+4], m0 + jmp .end + +.else: + movss m0, [pq] + movss m1, [F_LFTG_X] + mulss m0, m1 + movss [pq], m0 + jmp .end + +.extend: + shl i0d, 2 + shl i1d, 2 + mov tmp0q, i0q + mov tmp1q, i1q + movups m0, [pq+tmp0q+4] + shufps m0, m0, 0x1B + movups [pq+tmp0q-16], m0 + movups m0, [pq+tmp1q-20] + shufps m0, m0, 0x1B + movups [pq+tmp1q], m0 + + movups m3, [F_LFTG_DELTA] + mov tmp0q, i0q + mov tmp1q, i1q + shr tmp0q, 1 + sub tmp0q, 4 + shr tmp1q, 1 + add tmp1q, 8 + cmp tmp0q, tmp1q + jge .beginloop2 +.loop1: + add tmp0q, 12 + cmp tmp0q, tmp1q + jge .endloop1 + + movups m0, [pq+2*tmp0q-28] + movups m4, [pq+2*tmp0q-12] + movups m1, m0 + shufps m0, m4, 0xDD + shufps m1, m4, 0x88 + movups m2, [pq+2*tmp0q-24] + movups m5, [pq+2*tmp0q-8] + shufps m2, m5, 0xDD + addps m2, m1 + mulps m2, m3 + subps m0, m2 + movups m4, m1 + shufps m1, m0, 0x44 + shufps m1, m1, 0xD8 + shufps m4, m0, 0xEE + shufps m4, m4, 0xD8 + movups [pq+2*tmp0q-28], m1 + movups [pq+2*tmp0q-12], m4 + + add tmp0q, 4 + cmp tmp0q, tmp1q + jge .beginloop2 + jmp .loop1 + +.endloop1: + sub tmp0q, 12 +.littleloop1: + movss m0, [pq+2*tmp0q] + movss m1, [pq+2*tmp0q-4] + movss m2, [pq+2*tmp0q+4] + addss m1, m2 + mulss m1, m3 + subss m0, m1 + movss [pq+2*tmp0q], m0 + add tmp0q, 4 + cmp tmp0q, tmp1q + jl .littleloop1 + +.beginloop2: + movups m3, [F_LFTG_GAMMA] + mov tmp0q, i0q + mov tmp1q, i1q + shr 
tmp0q, 1 + sub tmp0q, 4 + shr tmp1q, 1 + add tmp1q, 4 + cmp tmp0q, tmp1q + jge .beginloop3 +.loop2: + add tmp0q, 12 + cmp tmp0q, tmp1q + jge .endloop2 + + movups m0, [pq+2*tmp0q-24] + movups m4, [pq+2*tmp0q-8] + movups m1, m0 + shufps m0, m4, 0xDD + shufps m1, m4, 0x88 + movups m2, [pq+2*tmp0q-20] + movups m5, [pq+2*tmp0q-4] + shufps m2, m5, 0xDD + addps m2, m1 + mulps m2, m3 + subps m0, m2 + movups m4, m1 + shufps m1, m0, 0x44 + shufps m1, m1, 0xD8 + shufps m4, m0, 0xEE + shufps m4, m4, 0xD8 + movups [pq+2*tmp0q-24], m1 + movups [pq+2*tmp0q-8], m4 + + add tmp0q, 4 + cmp tmp0q, tmp1q + jge .beginloop3 + jmp .loop2 + +.endloop2: + sub tmp0q, 12 +.littleloop2: + movss m0, [pq+2*tmp0q+4] + movss m1, [pq+2*tmp0q] + movss m2, [pq+2*tmp0q+8] + addss m1, m2 + mulss m1, m3 + subss m0, m1 + movss [pq+2*tmp0q+4], m0 + add tmp0q, 4 + cmp tmp0q, tmp1q + jl .littleloop2 + +.beginloop3: + movups m3, [F_LFTG_BETA] + mov tmp0q, i0q + mov tmp1q, i1q + shr tmp0q, 1 + sub tmp0q, 4 + shr tmp1q, 1 + add tmp1q, 8 + cmp tmp0q, tmp1q + jge .beginloop4 +.loop3: + add tmp0q, 12 + cmp tmp0q, tmp1q + jge .endloop3 + + movups m0, [pq+2*tmp0q-28] + movups m4, [pq+2*tmp0q-12] + movups m1, m0 + shufps m0, m4, 0xDD + shufps m1, m4, 0x88 + movups m2, [pq+2*tmp0q-24] + movups m5, [pq+2*tmp0q-8] + shufps m2, m5, 0xDD + addps m2, m1 + mulps m2, m3 + addps m0, m2 + movups m4, m1 + shufps m1, m0, 0x44 + shufps m1, m1, 0xD8 + shufps m4, m0, 0xEE + shufps m4, m4, 0xD8 + movups [pq+2*tmp0q-28], m1 + movups [pq+2*tmp0q-12], m4 + + add tmp0q, 4 + cmp tmp0q, tmp1q + jge .beginloop4 + jmp .loop3 + +.endloop3: + sub tmp0q, 12 +.littleloop3: + movss m0, [pq+2*tmp0q] + movss m1, [pq+2*tmp0q-4] + movss m2, [pq+2*tmp0q+4] + addss m1, m2 + mulss m1, m3 + addss m0, m1 + movss [pq+2*tmp0q], m0 + add tmp0q, 4 + cmp tmp0q, tmp1q + jl .littleloop3 + +.beginloop4: + movups m3, [F_LFTG_ALPHA] + mov tmp0q, i0q + mov tmp1q, i1q + shr tmp0q, 1 + sub tmp0q, 4 + shr tmp1q, 1 + add tmp1q, 4 + cmp tmp0q, tmp1q + jge .end +.loop4: 
+ add tmp0q, 12 + cmp tmp0q, tmp1q + jge .endloop4 + + movups m0, [pq+2*tmp0q-24] + movups m4, [pq+2*tmp0q-8] + movups m1, m0 + shufps m0, m4, 0xDD + shufps m1, m4, 0x88 + movups m2, [pq+2*tmp0q-20] + movups m5, [pq+2*tmp0q-4] + shufps m2, m5, 0xDD + addps m2, m1 + mulps m2, m3 + addps m0, m2 + movups m4, m1 + shufps m1, m0, 0x44 + shufps m1, m1, 0xD8 + shufps m4, m0, 0xEE + shufps m4, m4, 0xD8 + movups [pq+2*tmp0q-24], m1 + movups [pq+2*tmp0q-8], m4 + + add tmp0q, 4 + cmp tmp0q, tmp1q + jge .end + jmp .loop4 + +.endloop4: + sub tmp0q, 12 +.littleloop4: + movss m0, [pq+2*tmp0q+4] + movss m1, [pq+2*tmp0q] + movss m2, [pq+2*tmp0q+8] + addss m1, m2 + mulss m1, m3 + addss m0, m1 + movss [pq+2*tmp0q+4], m0 + add tmp0q, 4 + cmp tmp0q, tmp1q + jl .littleloop4 + +.end: + REP_RET +%endmacro + +INIT_XMM sse +SR1D97FLOAT + diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c index baa81383ea..3d3735c43a 100644 --- a/libavcodec/x86/jpeg2000dsp_init.c +++ b/libavcodec/x86/jpeg2000dsp_init.c @@ -23,12 +23,15 @@ #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" #include "libavcodec/jpeg2000dsp.h" +#include "libavcodec/jpeg2000dwt.h" void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize); void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize); void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize); void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize); +void ff_sr_1d97_float_sse(float *p, int i0, int i1); + av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c) { int cpu_flags = av_get_cpu_flags(); -- 2.11.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel