On 23 June 2016 at 20:57, James Almer <jamr...@gmail.com> wrote:

> On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote:
> > Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv>
> > ---
> >  libavcodec/x86/diracdsp.asm    | 47
> ++++++++++++++++++++++++++++++++++++++++++
> >  libavcodec/x86/diracdsp_init.c |  6 ++++++
> >  2 files changed, 53 insertions(+)
> >
> > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > index a042413..9db7b67 100644
> > --- a/libavcodec/x86/diracdsp.asm
> > +++ b/libavcodec/x86/diracdsp.asm
> > @@ -22,6 +22,8 @@
> >
> >  SECTION_RODATA
> >  pw_7: times 8 dw 7
> > +convert_to_unsigned_10bit: times 4 dd 0x200
> > +clip_10bit:                times 8 dw 0x3ff
> >
> >  cextern pw_3
> >  cextern pw_16
> > @@ -172,6 +174,48 @@ cglobal put_signed_rect_clamped_%1, 5,9,3, dst,
> dst_stride, src, src_stride, w,
> >      RET
> >  %endm
> >
> > +%macro PUT_RECT_10 0
> > +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const
> uint8_t *src, int src_stride, int width, int height)
> > +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src,
> src_stride, w, h
>
> This is x86_64 only. Either add the relevant pre-processor checks here
> and to the init file, or make the necessary changes to make it work
> on x86_32.
> Look at the 8bit version of put_signed_rect_clamped for an example of
> how to deal with this using stack.
>
> > +
> > +    neg      wq
> > +    neg      hq
>
> Why? You're not using these as part of effective addresses, just as
> counters. Keep them as is and just do sub instead of add in the loops
> below.
> For that matter, you'd need to sign extend these with movsxd before
> negating them, or change the prototype and make them ptrdiff_t instead
> of int.
>
> > +    mov      r6, srcq
> > +    mov      r7, dstq
> > +    mov      r8, wq
> > +    pxor     m2, m2
> > +    mova     m3, [clip_10bit]
> > +    mova     m4, [convert_to_unsigned_10bit]
> > +
> > +    .loop_h:
> > +    mov      srcq, r6
> > +    mov      dstq, r7
> > +    mov      wq,   r8
> > +
> > +    .loop_w:
> > +    movu     m0, [srcq+0*mmsize]
> > +    movu     m1, [srcq+1*mmsize]
> > +
> > +    paddd    m0, m4
> > +    paddd    m1, m4
> > +    packusdw m0, m0, m1
> > +    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine
>
> Would be nice if you could make this work with SSE2 as well.
> There are some examples of packusdw SSE2 emulation in the codebase.
>
> > +
> > +    movu     [dstq], m0
> > +
> > +    add      srcq, 2*mmsize
> > +    add      dstq, 1*mmsize
> > +    add      wq, 8
> > +    jl       .loop_w
> > +
> > +    add      r6, src_strideq
> > +    add      r7, dst_strideq
> > +    add      hq, 1
>
> Make sure to do "sub wd, 8" and "sub hd, 1" after removing the above
> negs if don't change the prototype.
>
> > +    jl       .loop_h
> > +
> > +    RET
> > +%endm
> > +
> >  %macro ADD_RECT 1
> >  ; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride,
> int16_t *idwt, int idwt_stride, int width, int height)
> >  cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt,
> idwt_stride, w, h
> > @@ -263,3 +307,6 @@ ADD_RECT sse2
> >  HPEL_FILTER sse2
> >  ADD_OBMC 32, sse2
> >  ADD_OBMC 16, sse2
> > +
> > +INIT_XMM sse4
> > +PUT_RECT_10
>
> No need to make it a macro if it's going to be a single version.
> If you add a SSE2 one then this would makes sense.
>
> > diff --git a/libavcodec/x86/diracdsp_init.c
> b/libavcodec/x86/diracdsp_init.c
> > index 5fae798..4786eea 100644
> > --- a/libavcodec/x86/diracdsp_init.c
> > +++ b/libavcodec/x86/diracdsp_init.c
> > @@ -46,6 +46,8 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int
> dst_stride, const int16_t *src,
> >  void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const
> int16_t *src, int src_stride, int width, int height);
> >  void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride,
> const int16_t *src, int src_stride, int width, int height);
> >
> > +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride,
> const uint8_t *src, int src_stride, int width, int height);
> > +
> >  #if HAVE_YASM
> >
> >  #define HPEL_FILTER(MMSIZE, EXT)
>                      \
> > @@ -184,4 +186,8 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> >          c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
> >          c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
> >      }
> > +
> > +    if (EXTERNAL_SSE4(mm_flags)) {
> > +        c->put_signed_rect_clamped[1] =
> ff_put_signed_rect_clamped_10_sse4;
> > +    }
> >  }
> >
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

Like with the dequant asm, I've attached an amended patch

Thanks
From 86ecebfe70509329d6f5b8a587ae79d19f9c8154 Mon Sep 17 00:00:00 2001
From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com>
Date: Thu, 23 Jun 2016 18:06:55 +0100
Subject: [PATCH 1/2] diracdsp: add SIMD for the 10 bit version of
 put_signed_rect_clamped

Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv>
---
 libavcodec/x86/diracdsp.asm    | 45 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/diracdsp_init.c | 10 ++++++++++
 2 files changed, 55 insertions(+)

diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index a042413..a0d6788 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -22,6 +22,8 @@
 
 SECTION_RODATA
 pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200
+clip_10bit:                times 8 dw 0x3ff
 
 cextern pw_3
 cextern pw_16
@@ -263,3 +265,46 @@ ADD_RECT sse2
 HPEL_FILTER sse2
 ADD_OBMC 32, sse2
 ADD_OBMC 16, sse2
+
+%if ARCH_X86_64 == 1
+INIT_XMM sse4
+
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h
+
+    mov      r6, srcq
+    mov      r7, dstq
+    mov      r8, wq
+    pxor     m2, m2
+    mova     m3, [clip_10bit]
+    mova     m4, [convert_to_unsigned_10bit]
+
+    .loop_h:
+    mov      srcq, r6
+    mov      dstq, r7
+    mov      wq,   r8
+
+    .loop_w:
+    movu     m0, [srcq+0*mmsize]
+    movu     m1, [srcq+1*mmsize]
+
+    paddd    m0, m4
+    paddd    m1, m4
+    packusdw m0, m0, m1
+    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine
+
+    movu     [dstq], m0
+
+    add      srcq, 2*mmsize
+    add      dstq, 1*mmsize
+    sub      wq, 8
+    jl       .loop_w
+
+    add      r6, src_strideq
+    add      r7, dst_strideq
+    sub      hq, 1
+    jl       .loop_h
+
+    RET
+
+%endif
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index 5fae798..7fa554e 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -46,6 +46,10 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src,
 void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 
+#if ARCH_X86_64
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+#endif
+
 #if HAVE_YASM
 
 #define HPEL_FILTER(MMSIZE, EXT)                                                             \
@@ -184,4 +188,10 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
         c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
         c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
     }
+
+#if ARCH_X86_64
+    if (EXTERNAL_SSE4(mm_flags)) {
+        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
+    }
+#endif
 }
-- 
2.8.1.369.geae769a

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to