On 24 June 2016 at 16:21, James Almer <jamr...@gmail.com> wrote: > On 6/24/2016 8:44 AM, Rostislav Pehlivanov wrote: > > From 86ecebfe70509329d6f5b8a587ae79d19f9c8154 Mon Sep 17 00:00:00 2001 > > From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com> > > Date: Thu, 23 Jun 2016 18:06:55 +0100 > > Subject: [PATCH 1/2] diracdsp: add SIMD for the 10 bit version of > > put_signed_rect_clamped > > > > Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> > > --- > > libavcodec/x86/diracdsp.asm | 45 > ++++++++++++++++++++++++++++++++++++++++++ > > libavcodec/x86/diracdsp_init.c | 10 ++++++++++ > > 2 files changed, 55 insertions(+) > > > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > > index a042413..a0d6788 100644 > > --- a/libavcodec/x86/diracdsp.asm > > +++ b/libavcodec/x86/diracdsp.asm > > @@ -22,6 +22,8 @@ > > > > SECTION_RODATA > > pw_7: times 8 dw 7 > > +convert_to_unsigned_10bit: times 4 dd 0x200 > > +clip_10bit: times 8 dw 0x3ff > > > > cextern pw_3 > > cextern pw_16 > > @@ -263,3 +265,46 @@ ADD_RECT sse2 > > HPEL_FILTER sse2 > > ADD_OBMC 32, sse2 > > ADD_OBMC 16, sse2 > > + > > +%if ARCH_X86_64 == 1 > > +INIT_XMM sse4 > > + > > +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const > uint8_t *src, int src_stride, int width, int height) > > +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, > src_stride, w, h > > + > > + mov r6, srcq > > + mov r7, dstq > > + mov r8, wq > > + pxor m2, m2 > > + mova m3, [clip_10bit] > > + mova m4, [convert_to_unsigned_10bit] > > + > > + .loop_h: > > + mov srcq, r6 > > + mov dstq, r7 > > + mov wq, r8 > > + > > + .loop_w: > > + movu m0, [srcq+0*mmsize] > > + movu m1, [srcq+1*mmsize] > > + > > + paddd m0, m4 > > + paddd m1, m4 > > + packusdw m0, m0, m1 > > + CLIPW m0, m2, m3 ; packusdw saturates so it's fine > > + > > + movu [dstq], m0 > > + > > + add srcq, 2*mmsize > > + add dstq, 1*mmsize > > + sub wq, 8 > > + jl .loop_w > > Since you're substracting w now, this should 
be jump if greater. > > Also, use wd, not wq, since it comes from stack on Win64. With msvc > x86_64 afaik there's no guarantee that the upper half of the register > is zeroed. > > > + > > + add r6, src_strideq > > + add r7, dst_strideq > > + sub hq, 1 > > + jl .loop_h > > Ditto. > > Alternatively as i said before is to just change the prototypes to > use ptrdiff_t instead of int. > > > + > > + RET > > + > > +%endif > > diff --git a/libavcodec/x86/diracdsp_init.c > b/libavcodec/x86/diracdsp_init.c > > index 5fae798..7fa554e 100644 > > --- a/libavcodec/x86/diracdsp_init.c > > +++ b/libavcodec/x86/diracdsp_init.c > > @@ -46,6 +46,10 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int > dst_stride, const int16_t *src, > > void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const > int16_t *src, int src_stride, int width, int height); > > void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, > const int16_t *src, int src_stride, int width, int height); > > > > +#if ARCH_X86_64 > > +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, > const uint8_t *src, int src_stride, int width, int height); > > +#endif > > + > > #if HAVE_YASM > > > > #define HPEL_FILTER(MMSIZE, EXT) > \ > > @@ -184,4 +188,10 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) > > c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; > > c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; > > } > > + > > +#if ARCH_X86_64 > > + if (EXTERNAL_SSE4(mm_flags)) { > > + c->put_signed_rect_clamped[1] = > ff_put_signed_rect_clamped_10_sse4; > > + } > > +#endif > > } > > -- > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
I've attached a new patch; it should be fine now. I chose not to change w and h to 64 bits, since that would require more changes to existing code.
From 56623038d6f73afc6c3723b57cde13b9e22a955e Mon Sep 17 00:00:00 2001 From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com> Date: Thu, 23 Jun 2016 18:06:55 +0100 Subject: [PATCH] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> --- libavcodec/x86/diracdsp.asm | 45 ++++++++++++++++++++++++++++++++++++++++++ libavcodec/x86/diracdsp_init.c | 10 ++++++++++ 2 files changed, 55 insertions(+) diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm index a042413..c5cc530 100644 --- a/libavcodec/x86/diracdsp.asm +++ b/libavcodec/x86/diracdsp.asm @@ -22,6 +22,8 @@ SECTION_RODATA pw_7: times 8 dw 7 +convert_to_unsigned_10bit: times 4 dd 0x200 +clip_10bit: times 8 dw 0x3ff cextern pw_3 cextern pw_16 @@ -263,3 +265,46 @@ ADD_RECT sse2 HPEL_FILTER sse2 ADD_OBMC 32, sse2 ADD_OBMC 16, sse2 + +%if ARCH_X86_64 == 1 +INIT_XMM sse4 + +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height) +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h + + mov r6, srcq + mov r7, dstq + mov r8, wq + pxor m2, m2 + mova m3, [clip_10bit] + mova m4, [convert_to_unsigned_10bit] + + .loop_h: + mov srcq, r6 + mov dstq, r7 + mov wq, r8 + + .loop_w: + movu m0, [srcq+0*mmsize] + movu m1, [srcq+1*mmsize] + + paddd m0, m4 + paddd m1, m4 + packusdw m0, m0, m1 + CLIPW m0, m2, m3 ; packusdw saturates so it's fine + + movu [dstq], m0 + + add srcq, 2*mmsize + add dstq, 1*mmsize + sub wd, 8 + jg .loop_w + + add r6, src_strideq + add r7, dst_strideq + sub hd, 1 + jg .loop_h + + RET + +%endif diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c index 5fae798..7fa554e 100644 --- a/libavcodec/x86/diracdsp_init.c +++ b/libavcodec/x86/diracdsp_init.c @@ -46,6 +46,10 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int 
dst_stride, const int16_t *src, int src_stride, int width, int height); void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); +#if ARCH_X86_64 +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height); +#endif + #if HAVE_YASM #define HPEL_FILTER(MMSIZE, EXT) \ @@ -184,4 +188,10 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; } + +#if ARCH_X86_64 + if (EXTERNAL_SSE4(mm_flags)) { + c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4; + } +#endif } -- 2.8.1.369.geae769a
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel