Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

Rostislav Pehlivanov Fri, 24 Jun 2016 04:44:00 -0700

On 23 June 2016 at 21:01, James Almer <jamr...@gmail.com> wrote:

> On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote:
> > Currently unused, to be used in the following commits.
> >
> > Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv>
> > ---
> >  libavcodec/diracdsp.c          | 24 ++++++++++++++++++++++++
> >  libavcodec/diracdsp.h          |  4 ++++
> >  libavcodec/x86/diracdsp.asm    | 41
> +++++++++++++++++++++++++++++++++++++++++
> >  libavcodec/x86/diracdsp_init.c |  4 +++-
> >  4 files changed, 72 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> > index ab8d149..d0cfd00 100644
> > --- a/libavcodec/diracdsp.c
> > +++ b/libavcodec/diracdsp.c
> > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const
> uint16_t *src, int stride,
> >      }
> >  }
> >
> > +#define DEQUANT_SUBBAND(PX)
>                             \
> > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst,
> ptrdiff_t stride,             \
> > +                                         const int qf, const int qs,
> int64_t tot_v, int64_t tot_h) \
>
> Shouldn't this be int (or ptrdiff_t)? Seeing they are int in the
> SliceCoeffs struct introduced by patch 6, i don't see why they
> should be int64_t here. Unless I'm missing something.
>
> > +{
>                             \
> > +    int i, y;
>                             \
> > +    for (y = 0; y < tot_v; y++) {
>                             \
> > +        PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;
>                             \
> > +        for (i = 0; i < tot_h; i++) {
>                             \
> > +            c = *src_r++;
>                             \
> > +            sign = FFSIGN(c)*(!!c);
>                             \
> > +            c = (FFABS(c)*qf + qs) >> 2;
>                            \
> > +            *dst_r++ = c*sign;
>                            \
> > +        }
>                             \
> > +        src += tot_h << (sizeof(PX) >> 1);
>                            \
> > +        dst += stride;
>                            \
> > +    }
>                             \
> > +}
> > +
> > +DEQUANT_SUBBAND(int16_t)
> > +DEQUANT_SUBBAND(int32_t)
> > +
> >  #define PIXFUNC(PFX, WIDTH)
>  \
> >      c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _c; \
> >      c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _l2_c; \
> > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
> >      c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
> >      c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
> >
> > +    c->dequant_subband[0] = c->dequant_subband[2] =
> dequant_subband_int16_t_c;
> > +    c->dequant_subband[1] = c->dequant_subband[3] =
> dequant_subband_int32_t_c;
> > +
> >      PIXFUNC(put, 8);
> >      PIXFUNC(put, 16);
> >      PIXFUNC(put, 32);
> > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> > index 25a872d..c0ac56b 100644
> > --- a/libavcodec/diracdsp.h
> > +++ b/libavcodec/diracdsp.h
> > @@ -22,6 +22,7 @@
> >  #define AVCODEC_DIRACDSP_H
> >
> >  #include <stdint.h>
> > +#include <stddef.h>
> >
> >  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int
> log2_denom, int weight, int h);
> >  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src,
> int stride, int log2_denom, int weightd, int weights, int h);
> > @@ -46,6 +47,9 @@ typedef struct {
> >      void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t
> *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int
> idwt_stride, int width, int height/*mod 2*/);
> >      void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int
> stride, const uint8_t *obmc_weight, int yblen);
> >
> > +    /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> > +    void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h);
> > +
> >      dirac_weight_func weight_dirac_pixels_tab[3];
> >      dirac_biweight_func biweight_dirac_pixels_tab[3];
> >  } DiracDSPContext;
> > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > index 9db7b67..f743363 100644
> > --- a/libavcodec/x86/diracdsp.asm
> > +++ b/libavcodec/x86/diracdsp.asm
> > @@ -289,6 +289,46 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src,
> stride, obmc, yblen
> >      RET
> >  %endm
> >
> > +%macro DEQUANT_SUBBAND_32 0
> > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
> const int qf, const int qs, int64_t tot_v, int64_t tot_h)
> > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v,
> tot_h
>
> Again, x86_64 only as is.
>
> > +
> > +    movd   m2, qfd
> > +    movd   m3, qsd
> > +    SPLATD m2
> > +    SPLATD m3
> > +    neg    tot_vq
> > +    neg    tot_hq
>
> Same as with put_signed_rect_clamped_10, no reason to neg these.
>
> > +    mov    r7, dstq
> > +    mov    r8, tot_hq
>
> You have qf and qs free. There's no need to use two extra registers.
> This and changing tot_v and tot_h to int/ptrdiff_t should make it work
> on x86_32 without extra work.
>
> > +
> > +    .loop_v:
> > +    mov    dstq,   r7
> > +    mov    tot_hq, r8
> > +
> > +    .loop_h:
> > +    movu   m0, [srcq]
> > +
> > +    pabsd  m1, m0
> > +    pmulld m1, m2
> > +    paddd  m1, m3
> > +    psrld  m1,  2
> > +    psignd m1, m0
> > +
> > +    movu   [dstq], m1
> > +
> > +    add    srcq, mmsize
> > +    add    dstq, mmsize
> > +    add    tot_hq, 4
> > +    jl     .loop_h
> > +
> > +    add    r7, strideq
> > +    add    tot_vq, 1
> > +    jl     .loop_v
> > +
> > +    RET
> > +%endm
> > +
> >  INIT_MMX
> >  %if ARCH_X86_64 == 0
> >  PUT_RECT mmx
> > @@ -310,3 +350,4 @@ ADD_OBMC 16, sse2
> >
> >  INIT_XMM sse4
> >  PUT_RECT_10
> > +DEQUANT_SUBBAND_32
>
> No reason to make it a macro. It's a single function.
>
> > diff --git a/libavcodec/x86/diracdsp_init.c
> b/libavcodec/x86/diracdsp_init.c
> > index 4786eea..8541eb3 100644
> > --- a/libavcodec/x86/diracdsp_init.c
> > +++ b/libavcodec/x86/diracdsp_init.c
> > @@ -45,9 +45,10 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int
> dst_stride, const int16_t *src, i
> >  void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const
> int16_t *src, int src_stride, int width, int height);
> >  void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const
> int16_t *src, int src_stride, int width, int height);
> >  void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride,
> const int16_t *src, int src_stride, int width, int height);
> > -
> >  void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride,
> const uint8_t *src, int src_stride, int width, int height);
> >
> > +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h);
> > +
> >  #if HAVE_YASM
> >
> >  #define HPEL_FILTER(MMSIZE, EXT)
>                      \
> > @@ -188,6 +189,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> >      }
> >
> >      if (EXTERNAL_SSE4(mm_flags)) {
> > +        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
> >          c->put_signed_rect_clamped[1] =
> ff_put_signed_rect_clamped_10_sse4;
> >      }
> >  }
> >
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>


Thanks for the suggestions; I'm attaching an amended patch.
It is still x86_64-only, but that can always be changed later when I have the time.

From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001
From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com>
Date: Thu, 23 Jun 2016 18:06:56 +0100
Subject: [PATCH 2/2] diracdsp: add dequantization SIMD

Currently unused, to be used in the following commits.

Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv>
---
 libavcodec/diracdsp.c          | 24 ++++++++++++++++++++++++
 libavcodec/diracdsp.h          |  4 ++++
 libavcodec/x86/diracdsp.asm    | 36 ++++++++++++++++++++++++++++++++++++
 libavcodec/x86/diracdsp_init.c |  2 ++
 4 files changed, 66 insertions(+)

diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
index ab8d149..cd1209e 100644
--- a/libavcodec/diracdsp.c
+++ b/libavcodec/diracdsp.c
@@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride,
     }
 }
 
+#define DEQUANT_SUBBAND(PX) /* dequantize tot_v rows of tot_h PX coefficients */           \
+static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t stride,     \
+                                         const int qf, const int qs, int tot_v, int tot_h) \
+{                                                                                          \
+    int row, col;                                                                          \
+    for (row = 0; row < tot_v; row++) {                                                    \
+        PX *in = (PX *)src, *out = (PX *)dst; /* row cursors */                            \
+        for (col = 0; col < tot_h; col++) {                                                \
+            const PX coef = in[col];                                                       \
+            const PX sgn  = FFSIGN(coef)*(!!coef); /* -1, 0 or +1 */                       \
+            const PX mag  = (FFABS(coef)*qf + qs) >> 2;                                    \
+            out[col] = mag*sgn;                                                            \
+        }                                                                                  \
+        src += tot_h << (sizeof(PX) >> 1); /* packed source rows */                        \
+        dst += stride;                                                                     \
+    }                                                                                      \
+}
+
+DEQUANT_SUBBAND(int16_t)
+DEQUANT_SUBBAND(int32_t)
+
 #define PIXFUNC(PFX, WIDTH)                                             \
     c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \
     c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \
@@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
     c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
     c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
 
+    c->dequant_subband[0] = c->dequant_subband[2] = dequant_subband_int16_t_c;
+    c->dequant_subband[1] = c->dequant_subband[3] = dequant_subband_int32_t_c;
+
     PIXFUNC(put, 8);
     PIXFUNC(put, 16);
     PIXFUNC(put, 32);
diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
index 25a872d..224828d 100644
--- a/libavcodec/diracdsp.h
+++ b/libavcodec/diracdsp.h
@@ -22,6 +22,7 @@
 #define AVCODEC_DIRACDSP_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h);
 typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h);
@@ -46,6 +47,9 @@ typedef struct {
     void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/);
     void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
 
+    /* 0-1: int16_t and int32_t (asm or C), 2-3: int16_t and int32_t (C only) */
+    void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
+
     dirac_weight_func weight_dirac_pixels_tab[3];
     dirac_biweight_func biweight_dirac_pixels_tab[3];
 } DiracDSPContext;
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index a0d6788..a764706 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w
 
     RET
 
+; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
+cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, tot_h
+; dst[i] = sign(src[i]) * ((abs(src[i]) * qf + qs) >> 2), four int32 coefficients per iteration
+    movd   m2, qfd
+    movd   m3, qsd
+    SPLATD m2                     ; m2 = qf broadcast to all dword lanes
+    SPLATD m3                     ; m3 = qs broadcast to all dword lanes
+    mov    r7, dstq               ; r7 = start of the current destination row
+    movsxd r8, tot_hd             ; tot_h is a 32-bit int: the upper half of tot_hq is undefined
+
+    .loop_v:
+    mov    dstq,   r7
+    mov    tot_hq, r8             ; reload the per-row coefficient count
+
+    .loop_h:
+    movu   m0, [srcq]
+
+    pabsd  m1, m0
+    pmulld m1, m2
+    paddd  m1, m3
+    psrld  m1,  2
+    psignd m1, m0                 ; reapply the input sign; zero inputs yield zero
+
+    movu   [dstq], m1
+; NOTE(review): rows are handled in steps of 4 coefficients; matches the C version only if tot_h % 4 == 0 - confirm callers
+    add    srcq, mmsize
+    add    dstq, mmsize
+    sub    tot_hq, 4
+    jg     .loop_h                ; counter counts down: loop while coefficients remain
+
+    add    r7, strideq
+    sub    tot_vd, 1              ; tot_v is a 32-bit int: only the low dword is valid
+    jg     .loop_v                ; loop while rows remain
+
+    RET
+
 %endif
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index 7fa554e..a1bab9c 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -48,6 +48,7 @@ void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t
 
 #if ARCH_X86_64
 void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
 #endif
 
 #if HAVE_YASM
@@ -191,6 +192,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
 
 #if ARCH_X86_64
     if (EXTERNAL_SSE4(mm_flags)) {
+        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
         c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
     }
 #endif
-- 
2.8.1.369.geae769a

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

Reply via email to