Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

Rostislav Pehlivanov Mon, 27 Jun 2016 04:54:46 -0700

On 24 June 2016 at 16:38, James Almer <[email protected]> wrote:

> On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote:
> > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001
> > From: Rostislav Pehlivanov <[email protected]>
> > Date: Thu, 23 Jun 2016 18:06:56 +0100
> > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD
> >
> > Currently unused, to be used in the following commits.
> >
> > Signed-off-by: Rostislav Pehlivanov <[email protected]>
> > ---
> >  libavcodec/diracdsp.c          | 24 ++++++++++++++++++++++++
> >  libavcodec/diracdsp.h          |  4 ++++
> >  libavcodec/x86/diracdsp.asm    | 36 ++++++++++++++++++++++++++++++++++++
> >  libavcodec/x86/diracdsp_init.c |  2 ++
> >  4 files changed, 66 insertions(+)
> >
> > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> > index ab8d149..cd1209e 100644
> > --- a/libavcodec/diracdsp.c
> > +++ b/libavcodec/diracdsp.c
> > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const
> uint16_t *src, int stride,
> >      }
> >  }
> >
> > +#define DEQUANT_SUBBAND(PX)
>                     \
> > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst,
> ptrdiff_t stride,     \
> > +                                         const int qf, const int qs,
> int tot_v, int tot_h) \
> > +{
>                     \
> > +    int i, y;
>                     \
> > +    for (y = 0; y < tot_v; y++) {
>                     \
> > +        PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;
>                     \
> > +        for (i = 0; i < tot_h; i++) {
>                     \
> > +            c = *src_r++;
>                     \
> > +            sign = FFSIGN(c)*(!!c);
>                     \
> > +            c = (FFABS(c)*qf + qs) >> 2;
>                    \
> > +            *dst_r++ = c*sign;
>                    \
> > +        }
>                     \
> > +        src += tot_h << (sizeof(PX) >> 1);
>                    \
> > +        dst += stride;
>                    \
> > +    }
>                     \
> > +}
> > +
> > +DEQUANT_SUBBAND(int16_t)
> > +DEQUANT_SUBBAND(int32_t)
> > +
> >  #define PIXFUNC(PFX, WIDTH)
>  \
> >      c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _c; \
> >      c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _l2_c; \
> > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
> >      c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
> >      c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
> >
> > +    c->dequant_subband[0] = c->dequant_subband[2] =
> dequant_subband_int16_t_c;
> > +    c->dequant_subband[1] = c->dequant_subband[3] =
> dequant_subband_int32_t_c;
> > +
> >      PIXFUNC(put, 8);
> >      PIXFUNC(put, 16);
> >      PIXFUNC(put, 32);
> > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> > index 25a872d..224828d 100644
> > --- a/libavcodec/diracdsp.h
> > +++ b/libavcodec/diracdsp.h
> > @@ -22,6 +22,7 @@
> >  #define AVCODEC_DIRACDSP_H
> >
> >  #include <stdint.h>
> > +#include <stddef.h>
> >
> >  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int
> log2_denom, int weight, int h);
> >  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src,
> int stride, int log2_denom, int weightd, int weights, int h);
> > @@ -46,6 +47,9 @@ typedef struct {
> >      void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t
> *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int
> idwt_stride, int width, int height/*mod 2*/);
> >      void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int
> stride, const uint8_t *obmc_weight, int yblen);
> >
> > +    /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> > +    void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int tot_v, int tot_h);
> > +
> >      dirac_weight_func weight_dirac_pixels_tab[3];
> >      dirac_biweight_func biweight_dirac_pixels_tab[3];
> >  } DiracDSPContext;
> > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > index a0d6788..a764706 100644
> > --- a/libavcodec/x86/diracdsp.asm
> > +++ b/libavcodec/x86/diracdsp.asm
> > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst,
> dst_stride, src, src_stride, w
> >
> >      RET
> >
> > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
> const int qf, const int qs, int tot_v, int tot_h)
> > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v,
> tot_h
> > +
> > +    movd   m2, qfd
> > +    movd   m3, qsd
> > +    SPLATD m2
> > +    SPLATD m3
> > +    mov    r7, dstq
> > +    mov    r8, tot_hq
>
> Replace every r7 and r8 with r3 and r4, make the cglobal line 7, 7, 4
> and the function will work on x86_32.
>
> > +
> > +    .loop_v:
> > +    mov    dstq,   r7
> > +    mov    tot_hq, r8
> > +
> > +    .loop_h:
> > +    movu   m0, [srcq]
> > +
> > +    pabsd  m1, m0
> > +    pmulld m1, m2
> > +    paddd  m1, m3
> > +    psrld  m1,  2
> > +    psignd m1, m0
> > +
> > +    movu   [dstq], m1
> > +
> > +    add    srcq, mmsize
> > +    add    dstq, mmsize
> > +    sub    tot_hq, 4
> > +    jl     .loop_h
>
> Jump if greater. Also use tot_hd, or change the prototypes.
>
> > +
> > +    add    r7, strideq
> > +    sub    tot_vq, 1
> > +    jl     .loop_v
>
> Ditto.
>
> > +
> > +    RET
> > +
> >  %endif
> > diff --git a/libavcodec/x86/diracdsp_init.c
> b/libavcodec/x86/diracdsp_init.c
> > index 7fa554e..a1bab9c 100644
> > --- a/libavcodec/x86/diracdsp_init.c
> > +++ b/libavcodec/x86/diracdsp_init.c
> > @@ -48,6 +48,7 @@ void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int
> dst_stride, const int16_t
> >
> >  #if ARCH_X86_64
> >  void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride,
> const uint8_t *src, int src_stride, int width, int height);
> > +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int tot_v, int tot_h);
> >  #endif
> >
> >  #if HAVE_YASM
> > @@ -191,6 +192,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> >
> >  #if ARCH_X86_64
> >      if (EXTERNAL_SSE4(mm_flags)) {
> > +        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
> >          c->put_signed_rect_clamped[1] =
> ff_put_signed_rect_clamped_10_sse4;
> >      }
> >  #endif
> > -- 2.8.1.369.geae769a
>
> _______________________________________________
> ffmpeg-devel mailing list
> [email protected]
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>


I've attached another patch, which should work fine now.
I wrote it on top of the put_signed_rect patch, so it does require that first
patch, but if this one is okay I'll amend and tidy things up before I push.
For some reason, keeping the saved copy of dstq in r3 or r4 broke it, and I've
no idea why: neither register is used after m2 and m3 are loaded. It should
work on x86_32 now, but I'm wondering why I can't do without the extra register.

From 0bdf36a57038f733e02c1d6f8fb88a9fc9848d32 Mon Sep 17 00:00:00 2001
From: Rostislav Pehlivanov <[email protected]>
Date: Thu, 23 Jun 2016 18:06:56 +0100
Subject: [PATCH] diracdsp: add dequantization SIMD

Currently unused, to be used in the following commits.

Signed-off-by: Rostislav Pehlivanov <[email protected]>
---
 libavcodec/diracdsp.c          | 24 ++++++++++++++++++++++++
 libavcodec/diracdsp.h          |  4 ++++
 libavcodec/x86/diracdsp.asm    | 38 +++++++++++++++++++++++++++++++++++++-
 libavcodec/x86/diracdsp_init.c |  7 +++++--
 4 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
index ab8d149..cd1209e 100644
--- a/libavcodec/diracdsp.c
+++ b/libavcodec/diracdsp.c
@@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride,
     }
 }
 
+#define DEQUANT_SUBBAND(PX) /* defines dequant_subband_<PX>_c for coefficient type PX */   \
+static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t stride,     \
+                                         const int qf, const int qs, int tot_v, int tot_h) \
+{                                                                                          \
+    int i, y;                                                                              \
+    for (y = 0; y < tot_v; y++) { /* one row of tot_h coefficients per pass */             \
+        PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;                                \
+        for (i = 0; i < tot_h; i++) {                                                      \
+            c = *src_r++;                                                                  \
+            sign = FFSIGN(c)*(!!c); /* presumably -1, 0 or +1; !!c forces 0 for c == 0 */  \
+            c = (FFABS(c)*qf + qs) >> 2; /* scale |c| by qf, add qs, shift right by 2 */   \
+            *dst_r++ = c*sign; /* reapply the sign */                                      \
+        }                                                                                  \
+        src += tot_h << (sizeof(PX) >> 1); /* tot_h * sizeof(PX) for 16/32-bit PX */       \
+        dst += stride; /* byte offset: dst is a uint8_t pointer */                         \
+    }                                                                                      \
+}
+
+DEQUANT_SUBBAND(int16_t) /* -> dequant_subband_int16_t_c */
+DEQUANT_SUBBAND(int32_t) /* -> dequant_subband_int32_t_c */
+
 #define PIXFUNC(PFX, WIDTH)                                             \
     c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \
     c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \
@@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
     c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
     c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
 
+    c->dequant_subband[0] = c->dequant_subband[2] = dequant_subband_int16_t_c;
+    c->dequant_subband[1] = c->dequant_subband[3] = dequant_subband_int32_t_c;
+
     PIXFUNC(put, 8);
     PIXFUNC(put, 16);
     PIXFUNC(put, 32);
diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
index 25a872d..224828d 100644
--- a/libavcodec/diracdsp.h
+++ b/libavcodec/diracdsp.h
@@ -22,6 +22,7 @@
 #define AVCODEC_DIRACDSP_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h);
 typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h);
@@ -46,6 +47,9 @@ typedef struct {
     void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/);
     void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
 
+    /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
+    void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
+
     dirac_weight_func weight_dirac_pixels_tab[3];
     dirac_biweight_func biweight_dirac_pixels_tab[3];
 } DiracDSPContext;
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index c5cc530..4bc8b2d 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -266,9 +266,45 @@ HPEL_FILTER sse2
 ADD_OBMC 32, sse2
 ADD_OBMC 16, sse2
 
-%if ARCH_X86_64 == 1
 INIT_XMM sse4
 
+; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
+cglobal dequant_subband_32, 7, 8, 4, src, dst, stride, qf, qs, tot_v, tot_h ; 7 args, 8 GPRs, 4 vector regs requested
+
+    movd   m2, qfd                      ; low dword of m2 = qf
+    movd   m3, qsd                      ; low dword of m3 = qs
+    SPLATD m2                           ; SPLATD is defined outside this hunk; presumably broadcasts the low dword to all lanes
+    SPLATD m3                           ; likewise for qs
+    mov    r4, tot_hq                   ; r4 = saved row width; r4 is the 5th-argument register (qs), already copied into m3
+    mov    r7, dstq                     ; r7 = start of current dst row. NOTE(review): r7 is an 8th GPR; x86inc appears to provide only r0-r6 on x86_32 -- confirm this assembles there
+
+    .loop_v:                            ; one pass per row; tested at the bottom, so it always runs at least once
+    mov    tot_hq, r4                   ; reload the column counter for this row
+    mov    dstq,   r7                   ; point dst at the start of this row
+
+    .loop_h:                            ; one vector (4 dwords under INIT_XMM) per pass; also runs at least once
+    movu   m0, [srcq]                   ; unaligned load of 4 coefficients
+
+    pabsd  m1, m0                       ; m1 = |c|
+    pmulld m1, m2                       ; m1 = |c| * qf (low 32 bits of each product)
+    paddd  m1, m3                       ; m1 = |c| * qf + qs
+    psrld  m1,  2                       ; m1 >>= 2 (logical shift)
+    psignd m1, m0                       ; negate lanes where c < 0, zero lanes where c == 0
+
+    movu   [dstq], m1                   ; unaligned store of 4 results
+
+    add    srcq, mmsize                 ; src only ever moves forward: it is not rewound or strided per row
+    add    dstq, mmsize
+    sub    tot_hd, 4                    ; 4 coefficients consumed
+    jg     .loop_h                      ; NOTE(review): if tot_h is not a multiple of 4, src overshoots the tot_h*4 bytes the C version advances per row -- confirm callers guarantee this
+
+    add    r7, strideq                  ; next dst row starts stride bytes further on
+    dec    tot_vd                       ; one row done
+    jg     .loop_v                      ; continue while rows remain
+
+    RET
+
+%if ARCH_X86_64 == 1
 ; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
 cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h
 
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index 7fa554e..7f85518 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -50,6 +50,8 @@ void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t
 void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
 #endif
 
+void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
+
 #if HAVE_YASM
 
 #define HPEL_FILTER(MMSIZE, EXT)                                                             \
@@ -189,9 +191,10 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
         c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
     }
 
-#if ARCH_X86_64
     if (EXTERNAL_SSE4(mm_flags)) {
+        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
+#if ARCH_X86_64
         c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
-    }
 #endif
+    }
 }
-- 
2.8.1.369.geae769a

_______________________________________________
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

Reply via email to