On 24 June 2016 at 16:38, James Almer <jamr...@gmail.com> wrote: > On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote: > > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001 > > From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com> > > Date: Thu, 23 Jun 2016 18:06:56 +0100 > > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD > > > > Currently unused, to be used in the following commits. > > > > Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> > > --- > > libavcodec/diracdsp.c | 24 ++++++++++++++++++++++++ > > libavcodec/diracdsp.h | 4 ++++ > > libavcodec/x86/diracdsp.asm | 36 ++++++++++++++++++++++++++++++++++++ > > libavcodec/x86/diracdsp_init.c | 2 ++ > > 4 files changed, 66 insertions(+) > > > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c > > index ab8d149..cd1209e 100644 > > --- a/libavcodec/diracdsp.c > > +++ b/libavcodec/diracdsp.c > > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const > uint16_t *src, int stride, > > } > > } > > > > +#define DEQUANT_SUBBAND(PX) > \ > > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, > ptrdiff_t stride, \ > > + const int qf, const int qs, > int tot_v, int tot_h) \ > > +{ > \ > > + int i, y; > \ > > + for (y = 0; y < tot_v; y++) { > \ > > + PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; > \ > > + for (i = 0; i < tot_h; i++) { > \ > > + c = *src_r++; > \ > > + sign = FFSIGN(c)*(!!c); > \ > > + c = (FFABS(c)*qf + qs) >> 2; > \ > > + *dst_r++ = c*sign; > \ > > + } > \ > > + src += tot_h << (sizeof(PX) >> 1); > \ > > + dst += stride; > \ > > + } > \ > > +} > > + > > +DEQUANT_SUBBAND(int16_t) > > +DEQUANT_SUBBAND(int32_t) > > + > > #define PIXFUNC(PFX, WIDTH) > \ > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## > _dirac_pixels ## WIDTH ## _c; \ > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## > _dirac_pixels ## WIDTH ## _l2_c; \ > > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) > > 
c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; > > c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; > > > > + c->dequant_subband[0] = c->dequant_subband[2] = > dequant_subband_int16_t_c; > > + c->dequant_subband[1] = c->dequant_subband[3] = > dequant_subband_int32_t_c; > > + > > PIXFUNC(put, 8); > > PIXFUNC(put, 16); > > PIXFUNC(put, 32); > > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h > > index 25a872d..224828d 100644 > > --- a/libavcodec/diracdsp.h > > +++ b/libavcodec/diracdsp.h > > @@ -22,6 +22,7 @@ > > #define AVCODEC_DIRACDSP_H > > > > #include <stdint.h> > > +#include <stddef.h> > > > > typedef void (*dirac_weight_func)(uint8_t *block, int stride, int > log2_denom, int weight, int h); > > typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, > int stride, int log2_denom, int weightd, int weights, int h); > > @@ -46,6 +47,9 @@ typedef struct { > > void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int > idwt_stride, int width, int height/*mod 2*/); > > void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int > stride, const uint8_t *obmc_weight, int yblen); > > > > + /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ > > + void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t > stride, const int qf, const int qs, int tot_v, int tot_h); > > + > > dirac_weight_func weight_dirac_pixels_tab[3]; > > dirac_biweight_func biweight_dirac_pixels_tab[3]; > > } DiracDSPContext; > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > > index a0d6788..a764706 100644 > > --- a/libavcodec/x86/diracdsp.asm > > +++ b/libavcodec/x86/diracdsp.asm > > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, > dst_stride, src, src_stride, w > > > > RET > > > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > const int qf, const int qs, int tot_v, int 
tot_h) > > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, > tot_h > > + > > + movd m2, qfd > > + movd m3, qsd > > + SPLATD m2 > > + SPLATD m3 > > + mov r7, dstq > > + mov r8, tot_hq > > Replace every r7 and r8 with r3 and r4, make the cglobal line 7, 7, 4 > and the function will work on x86_32. > > > + > > + .loop_v: > > + mov dstq, r7 > > + mov tot_hq, r8 > > + > > + .loop_h: > > + movu m0, [srcq] > > + > > + pabsd m1, m0 > > + pmulld m1, m2 > > + paddd m1, m3 > > + psrld m1, 2 > > + psignd m1, m0 > > + > > + movu [dstq], m1 > > + > > + add srcq, mmsize > > + add dstq, mmsize > > + sub tot_hq, 4 > > + jl .loop_h > > Jump if greater. Also use tot_hd, or change the prototypes. > > > + > > + add r7, strideq > > + sub tot_vq, 1 > > + jl .loop_v > > Ditto. > > > + > > + RET > > + > > %endif > > diff --git a/libavcodec/x86/diracdsp_init.c > b/libavcodec/x86/diracdsp_init.c > > index 7fa554e..a1bab9c 100644 > > --- a/libavcodec/x86/diracdsp_init.c > > +++ b/libavcodec/x86/diracdsp_init.c > > @@ -48,6 +48,7 @@ void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int > dst_stride, const int16_t > > > > #if ARCH_X86_64 > > void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, > const uint8_t *src, int src_stride, int width, int height); > > +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t > stride, const int qf, const int qs, int tot_v, int tot_h); > > #endif > > > > #if HAVE_YASM > > @@ -191,6 +192,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) > > > > #if ARCH_X86_64 > > if (EXTERNAL_SSE4(mm_flags)) { > > + c->dequant_subband[1] = ff_dequant_subband_32_sse4; > > c->put_signed_rect_clamped[1] = > ff_put_signed_rect_clamped_10_sse4; > > } > > #endif > > -- 2.8.1.369.geae769a > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
I've attached another patch which should work fine now. I wrote it on top of the put_signed_rect patch, so it does require the first patch, but if this patch is okay I'll amend and tidy things up before I push. For some reason, changing dstq to be stored in r4 or r3 broke it, and I have no idea why: neither register is used after m2 and m3 are loaded. It should work on x86_32 now, but I'm still wondering why I can't save that register.
From 0bdf36a57038f733e02c1d6f8fb88a9fc9848d32 Mon Sep 17 00:00:00 2001 From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com> Date: Thu, 23 Jun 2016 18:06:56 +0100 Subject: [PATCH] diracdsp: add dequantization SIMD Currently unused, to be used in the following commits. Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> --- libavcodec/diracdsp.c | 24 ++++++++++++++++++++++++ libavcodec/diracdsp.h | 4 ++++ libavcodec/x86/diracdsp.asm | 38 +++++++++++++++++++++++++++++++++++++- libavcodec/x86/diracdsp_init.c | 7 +++++-- 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c index ab8d149..cd1209e 100644 --- a/libavcodec/diracdsp.c +++ b/libavcodec/diracdsp.c @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride, } } +#define DEQUANT_SUBBAND(PX) \ +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t stride, \ + const int qf, const int qs, int tot_v, int tot_h) \ +{ \ + int i, y; \ + for (y = 0; y < tot_v; y++) { \ + PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; \ + for (i = 0; i < tot_h; i++) { \ + c = *src_r++; \ + sign = FFSIGN(c)*(!!c); \ + c = (FFABS(c)*qf + qs) >> 2; \ + *dst_r++ = c*sign; \ + } \ + src += tot_h << (sizeof(PX) >> 1); \ + dst += stride; \ + } \ +} + +DEQUANT_SUBBAND(int16_t) +DEQUANT_SUBBAND(int32_t) + #define PIXFUNC(PFX, WIDTH) \ c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \ c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \ @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; + c->dequant_subband[0] = c->dequant_subband[2] = dequant_subband_int16_t_c; + c->dequant_subband[1] = c->dequant_subband[3] = dequant_subband_int32_t_c; + PIXFUNC(put, 8); PIXFUNC(put, 16); PIXFUNC(put, 32); diff --git 
a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h index 25a872d..224828d 100644 --- a/libavcodec/diracdsp.h +++ b/libavcodec/diracdsp.h @@ -22,6 +22,7 @@ #define AVCODEC_DIRACDSP_H #include <stdint.h> +#include <stddef.h> typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h); typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h); @@ -46,6 +47,9 @@ typedef struct { void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/); void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); + /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ + void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h); + dirac_weight_func weight_dirac_pixels_tab[3]; dirac_biweight_func biweight_dirac_pixels_tab[3]; } DiracDSPContext; diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm index c5cc530..4bc8b2d 100644 --- a/libavcodec/x86/diracdsp.asm +++ b/libavcodec/x86/diracdsp.asm @@ -266,9 +266,45 @@ HPEL_FILTER sse2 ADD_OBMC 32, sse2 ADD_OBMC 16, sse2 -%if ARCH_X86_64 == 1 INIT_XMM sse4 +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h) +cglobal dequant_subband_32, 7, 8, 4, src, dst, stride, qf, qs, tot_v, tot_h + + movd m2, qfd + movd m3, qsd + SPLATD m2 + SPLATD m3 + mov r4, tot_hq + mov r7, dstq + + .loop_v: + mov tot_hq, r4 + mov dstq, r7 + + .loop_h: + movu m0, [srcq] + + pabsd m1, m0 + pmulld m1, m2 + paddd m1, m3 + psrld m1, 2 + psignd m1, m0 + + movu [dstq], m1 + + add srcq, mmsize + add dstq, mmsize + sub tot_hd, 4 + jg .loop_h + + add r7, strideq + dec tot_vd + jg .loop_v + + RET + +%if ARCH_X86_64 == 1 ; void 
put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height) cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c index 7fa554e..7f85518 100644 --- a/libavcodec/x86/diracdsp_init.c +++ b/libavcodec/x86/diracdsp_init.c @@ -50,6 +50,8 @@ void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height); #endif +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h); + #if HAVE_YASM #define HPEL_FILTER(MMSIZE, EXT) \ @@ -189,9 +191,10 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; } -#if ARCH_X86_64 if (EXTERNAL_SSE4(mm_flags)) { + c->dequant_subband[1] = ff_dequant_subband_32_sse4; +#if ARCH_X86_64 c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4; - } #endif + } } -- 2.8.1.369.geae769a
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel