On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote: > Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> > --- > libavcodec/x86/diracdsp.asm | 47 > ++++++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/diracdsp_init.c | 6 ++++++ > 2 files changed, 53 insertions(+) > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > index a042413..9db7b67 100644 > --- a/libavcodec/x86/diracdsp.asm > +++ b/libavcodec/x86/diracdsp.asm > @@ -22,6 +22,8 @@ > > SECTION_RODATA > pw_7: times 8 dw 7 > +convert_to_unsigned_10bit: times 4 dd 0x200 > +clip_10bit: times 8 dw 0x3ff > > cextern pw_3 > cextern pw_16 > @@ -172,6 +174,48 @@ cglobal put_signed_rect_clamped_%1, 5,9,3, dst, > dst_stride, src, src_stride, w, > RET > %endm > > +%macro PUT_RECT_10 0 > +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const > uint8_t *src, int src_stride, int width, int height) > +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, > src_stride, w, h
This is x86_64 only. Either add the relevant pre-processor checks here and to the init file, or make the necessary changes to make it work on x86_32. Look at the 8-bit version of put_signed_rect_clamped for an example of how to deal with this using the stack. > + > + neg wq > + neg hq Why? You're not using these as part of effective addresses, just as counters. Keep them as is and just do sub instead of add in the loops below. For that matter, you'd need to sign extend these with movsxd before negating them, or change the prototype and make them ptrdiff_t instead of int. > + mov r6, srcq > + mov r7, dstq > + mov r8, wq > + pxor m2, m2 > + mova m3, [clip_10bit] > + mova m4, [convert_to_unsigned_10bit] > + > + .loop_h: > + mov srcq, r6 > + mov dstq, r7 > + mov wq, r8 > + > + .loop_w: > + movu m0, [srcq+0*mmsize] > + movu m1, [srcq+1*mmsize] > + > + paddd m0, m4 > + paddd m1, m4 > + packusdw m0, m0, m1 > + CLIPW m0, m2, m3 ; packusdw saturates so it's fine Would be nice if you could make this work with SSE2 as well. There are some examples of packusdw SSE2 emulation in the codebase. > + > + movu [dstq], m0 > + > + add srcq, 2*mmsize > + add dstq, 1*mmsize > + add wq, 8 > + jl .loop_w > + > + add r6, src_strideq > + add r7, dst_strideq > + add hq, 1 Make sure to do "sub wd, 8" and "sub hd, 1" after removing the above negs if you don't change the prototype. > + jl .loop_h > + > + RET > +%endm > + > %macro ADD_RECT 1 > ; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t > *idwt, int idwt_stride, int width, int height) > cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h > @@ -263,3 +307,6 @@ ADD_RECT sse2 > HPEL_FILTER sse2 > ADD_OBMC 32, sse2 > ADD_OBMC 16, sse2 > + > +INIT_XMM sse4 > +PUT_RECT_10 No need to make it a macro if it's going to be a single version. If you add an SSE2 one then this would make sense. 
> diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c > index 5fae798..4786eea 100644 > --- a/libavcodec/x86/diracdsp_init.c > +++ b/libavcodec/x86/diracdsp_init.c > @@ -46,6 +46,8 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, > const int16_t *src, > void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const > int16_t *src, int src_stride, int width, int height); > void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const > int16_t *src, int src_stride, int width, int height); > > +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const > uint8_t *src, int src_stride, int width, int height); > + > #if HAVE_YASM > > #define HPEL_FILTER(MMSIZE, EXT) > \ > @@ -184,4 +186,8 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) > c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; > c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; > } > + > + if (EXTERNAL_SSE4(mm_flags)) { > + c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4; > + } > } > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel