On 23 June 2016 at 20:57, James Almer <jamr...@gmail.com> wrote: > On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote: > > Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> > > --- > > libavcodec/x86/diracdsp.asm | 47 > ++++++++++++++++++++++++++++++++++++++++++ > > libavcodec/x86/diracdsp_init.c | 6 ++++++ > > 2 files changed, 53 insertions(+) > > > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > > index a042413..9db7b67 100644 > > --- a/libavcodec/x86/diracdsp.asm > > +++ b/libavcodec/x86/diracdsp.asm > > @@ -22,6 +22,8 @@ > > > > SECTION_RODATA > > pw_7: times 8 dw 7 > > +convert_to_unsigned_10bit: times 4 dd 0x200 > > +clip_10bit: times 8 dw 0x3ff > > > > cextern pw_3 > > cextern pw_16 > > @@ -172,6 +174,48 @@ cglobal put_signed_rect_clamped_%1, 5,9,3, dst, > dst_stride, src, src_stride, w, > > RET > > %endm > > > > +%macro PUT_RECT_10 0 > > +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const > uint8_t *src, int src_stride, int width, int height) > > +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, > src_stride, w, h > > This is x86_64 only. Either add the relevant pre-processor checks here > and to the init file, or make the necessary changes to make it work > on x86_32. > Look at the 8bit version of put_signed_rect_clamped for an example of > how to deal with this using stack. > > > + > > + neg wq > > + neg hq > > Why? You're not using these as part of effective addresses, just as > counters. Keep them as is and just do sub instead of add in the loops > below. > For that matter, you'd need to sign extend these with movsxd before > negating them, or change the prototype and make them ptrdiff_t instead > of int. 
> > > + mov r6, srcq > > + mov r7, dstq > > + mov r8, wq > > + pxor m2, m2 > > + mova m3, [clip_10bit] > > + mova m4, [convert_to_unsigned_10bit] > > + > > + .loop_h: > > + mov srcq, r6 > > + mov dstq, r7 > > + mov wq, r8 > > + > > + .loop_w: > > + movu m0, [srcq+0*mmsize] > > + movu m1, [srcq+1*mmsize] > > + > > + paddd m0, m4 > > + paddd m1, m4 > > + packusdw m0, m0, m1 > > + CLIPW m0, m2, m3 ; packusdw saturates so it's fine > > Would be nice if you could make this work with SSE2 as well. > There are some examples of packusdw SSE2 emulation in the codebase. > > > + > > + movu [dstq], m0 > > + > > + add srcq, 2*mmsize > > + add dstq, 1*mmsize > > + add wq, 8 > > + jl .loop_w > > + > > + add r6, src_strideq > > + add r7, dst_strideq > > + add hq, 1 > > Make sure to do "sub wd, 8" and "sub hd, 1" after removing the above > negs if you don't change the prototype. > > > + jl .loop_h > > + > > + RET > > +%endm > > + > > %macro ADD_RECT 1 > > ; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, > int16_t *idwt, int idwt_stride, int width, int height) > > cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, > idwt_stride, w, h > > @@ -263,3 +307,6 @@ ADD_RECT sse2 > > HPEL_FILTER sse2 > > ADD_OBMC 32, sse2 > > ADD_OBMC 16, sse2 > > + > > +INIT_XMM sse4 > > +PUT_RECT_10 > > No need to make it a macro if it's going to be a single version. > If you add a SSE2 one then this would make sense. 
> > > diff --git a/libavcodec/x86/diracdsp_init.c > b/libavcodec/x86/diracdsp_init.c > > index 5fae798..4786eea 100644 > > --- a/libavcodec/x86/diracdsp_init.c > > +++ b/libavcodec/x86/diracdsp_init.c > > @@ -46,6 +46,8 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int > dst_stride, const int16_t *src, > > void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const > int16_t *src, int src_stride, int width, int height); > > void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, > const int16_t *src, int src_stride, int width, int height); > > > > +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, > const uint8_t *src, int src_stride, int width, int height); > > + > > #if HAVE_YASM > > > > #define HPEL_FILTER(MMSIZE, EXT) > \ > > @@ -184,4 +186,8 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) > > c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; > > c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; > > } > > + > > + if (EXTERNAL_SSE4(mm_flags)) { > > + c->put_signed_rect_clamped[1] = > ff_put_signed_rect_clamped_10_sse4; > > + } > > } > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
As with the dequant asm, I've attached an amended patch. Thanks.
From 86ecebfe70509329d6f5b8a587ae79d19f9c8154 Mon Sep 17 00:00:00 2001 From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com> Date: Thu, 23 Jun 2016 18:06:55 +0100 Subject: [PATCH 1/2] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> --- libavcodec/x86/diracdsp.asm | 45 ++++++++++++++++++++++++++++++++++++++++++ libavcodec/x86/diracdsp_init.c | 10 ++++++++++ 2 files changed, 55 insertions(+) diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm index a042413..a0d6788 100644 --- a/libavcodec/x86/diracdsp.asm +++ b/libavcodec/x86/diracdsp.asm @@ -22,6 +22,8 @@ SECTION_RODATA pw_7: times 8 dw 7 +convert_to_unsigned_10bit: times 4 dd 0x200 +clip_10bit: times 8 dw 0x3ff cextern pw_3 cextern pw_16 @@ -263,3 +265,46 @@ ADD_RECT sse2 HPEL_FILTER sse2 ADD_OBMC 32, sse2 ADD_OBMC 16, sse2 + +%if ARCH_X86_64 == 1 +INIT_XMM sse4 + +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height) +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h + + mov r6, srcq + mov r7, dstq + mov r8, wq + pxor m2, m2 + mova m3, [clip_10bit] + mova m4, [convert_to_unsigned_10bit] + + .loop_h: + mov srcq, r6 + mov dstq, r7 + mov wq, r8 + + .loop_w: + movu m0, [srcq+0*mmsize] + movu m1, [srcq+1*mmsize] + + paddd m0, m4 + paddd m1, m4 + packusdw m0, m0, m1 + CLIPW m0, m2, m3 ; packusdw saturates so it's fine + + movu [dstq], m0 + + add srcq, 2*mmsize + add dstq, 1*mmsize + sub wq, 8 + jl .loop_w + + add r6, src_strideq + add r7, dst_strideq + sub hq, 1 + jl .loop_h + + RET + +%endif diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c index 5fae798..7fa554e 100644 --- a/libavcodec/x86/diracdsp_init.c +++ b/libavcodec/x86/diracdsp_init.c @@ -46,6 +46,10 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int 
dst_stride, const int16_t *src, int src_stride, int width, int height); void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +#if ARCH_X86_64 +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height); +#endif + #if HAVE_YASM #define HPEL_FILTER(MMSIZE, EXT) \ @@ -184,4 +188,10 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; } + +#if ARCH_X86_64 + if (EXTERNAL_SSE4(mm_flags)) { + c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4; + } +#endif } -- 2.8.1.369.geae769a
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel