On 6/24/2016 8:44 AM, Rostislav Pehlivanov wrote: > From 86ecebfe70509329d6f5b8a587ae79d19f9c8154 Mon Sep 17 00:00:00 2001 > From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com> > Date: Thu, 23 Jun 2016 18:06:55 +0100 > Subject: [PATCH 1/2] diracdsp: add SIMD for the 10 bit version of > put_signed_rect_clamped > > Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> > --- > libavcodec/x86/diracdsp.asm | 45 > ++++++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/diracdsp_init.c | 10 ++++++++++ > 2 files changed, 55 insertions(+) > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > index a042413..a0d6788 100644 > --- a/libavcodec/x86/diracdsp.asm > +++ b/libavcodec/x86/diracdsp.asm > @@ -22,6 +22,8 @@ > > SECTION_RODATA > pw_7: times 8 dw 7 > +convert_to_unsigned_10bit: times 4 dd 0x200 > +clip_10bit: times 8 dw 0x3ff > > cextern pw_3 > cextern pw_16 > @@ -263,3 +265,46 @@ ADD_RECT sse2 > HPEL_FILTER sse2 > ADD_OBMC 32, sse2 > ADD_OBMC 16, sse2 > + > +%if ARCH_X86_64 == 1 > +INIT_XMM sse4 > + > +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const > uint8_t *src, int src_stride, int width, int height) > +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, > src_stride, w, h > + > + mov r6, srcq > + mov r7, dstq > + mov r8, wq > + pxor m2, m2 > + mova m3, [clip_10bit] > + mova m4, [convert_to_unsigned_10bit] > + > + .loop_h: > + mov srcq, r6 > + mov dstq, r7 > + mov wq, r8 > + > + .loop_w: > + movu m0, [srcq+0*mmsize] > + movu m1, [srcq+1*mmsize] > + > + paddd m0, m4 > + paddd m1, m4 > + packusdw m0, m0, m1 > + CLIPW m0, m2, m3 ; packusdw saturates so it's fine > + > + movu [dstq], m0 > + > + add srcq, 2*mmsize > + add dstq, 1*mmsize > + sub wq, 8 > + jl .loop_w
Since you're subtracting from w now, this should be a jump-if-greater (jg), not jl. Also, use wd, not wq, since w is loaded from the stack on Win64. With MSVC x86_64, AFAIK, there's no guarantee that the upper half of the register is zeroed. > + > + add r6, src_strideq > + add r7, dst_strideq > + sub hq, 1 > + jl .loop_h Ditto (jg instead of jl, and hd instead of hq). Alternatively, as I said before, you could just change the prototypes to use ptrdiff_t instead of int. > + > + RET > + > +%endif > diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c > index 5fae798..7fa554e 100644 > --- a/libavcodec/x86/diracdsp_init.c > +++ b/libavcodec/x86/diracdsp_init.c > @@ -46,6 +46,10 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int > dst_stride, const int16_t *src, > void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const > int16_t *src, int src_stride, int width, int height); > void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const > int16_t *src, int src_stride, int width, int height); > > +#if ARCH_X86_64 > +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const > uint8_t *src, int src_stride, int width, int height); > +#endif > + > #if HAVE_YASM > > #define HPEL_FILTER(MMSIZE, EXT) > \ > @@ -184,4 +188,10 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) > c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; > c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; > } > + > +#if ARCH_X86_64 > + if (EXTERNAL_SSE4(mm_flags)) { > + c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4; > + } > +#endif > } > -- _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel