On 23 June 2016 at 21:01, James Almer <jamr...@gmail.com> wrote: > On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote: > > Currently unused, to be used in the following commits. > > > > Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> > > --- > > libavcodec/diracdsp.c | 24 ++++++++++++++++++++++++ > > libavcodec/diracdsp.h | 4 ++++ > > libavcodec/x86/diracdsp.asm | 41 > +++++++++++++++++++++++++++++++++++++++++ > > libavcodec/x86/diracdsp_init.c | 4 +++- > > 4 files changed, 72 insertions(+), 1 deletion(-) > > > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c > > index ab8d149..d0cfd00 100644 > > --- a/libavcodec/diracdsp.c > > +++ b/libavcodec/diracdsp.c > > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const > uint16_t *src, int stride, > > } > > } > > > > +#define DEQUANT_SUBBAND(PX) > \ > > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, > ptrdiff_t stride, \ > > + const int qf, const int qs, > int64_t tot_v, int64_t tot_h) \ > > Shouldn't this be int (or ptrdiff_t)? Seeing they are int in the > SliceCoeffs struct introduced by patch 6, i don't see why they > should be int64_t here. Unless I'm missing something. 
> > > +{ > \ > > + int i, y; > \ > > + for (y = 0; y < tot_v; y++) { > \ > > + PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; > \ > > + for (i = 0; i < tot_h; i++) { > \ > > + c = *src_r++; > \ > > + sign = FFSIGN(c)*(!!c); > \ > > + c = (FFABS(c)*qf + qs) >> 2; > \ > > + *dst_r++ = c*sign; > \ > > + } > \ > > + src += tot_h << (sizeof(PX) >> 1); > \ > > + dst += stride; > \ > > + } > \ > > +} > > + > > +DEQUANT_SUBBAND(int16_t) > > +DEQUANT_SUBBAND(int32_t) > > + > > #define PIXFUNC(PFX, WIDTH) > \ > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## > _dirac_pixels ## WIDTH ## _c; \ > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## > _dirac_pixels ## WIDTH ## _l2_c; \ > > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) > > c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; > > c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; > > > > + c->dequant_subband[0] = c->dequant_subband[2] = > dequant_subband_int16_t_c; > > + c->dequant_subband[1] = c->dequant_subband[3] = > dequant_subband_int32_t_c; > > + > > PIXFUNC(put, 8); > > PIXFUNC(put, 16); > > PIXFUNC(put, 32); > > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h > > index 25a872d..c0ac56b 100644 > > --- a/libavcodec/diracdsp.h > > +++ b/libavcodec/diracdsp.h > > @@ -22,6 +22,7 @@ > > #define AVCODEC_DIRACDSP_H > > > > #include <stdint.h> > > +#include <stddef.h> > > > > typedef void (*dirac_weight_func)(uint8_t *block, int stride, int > log2_denom, int weight, int h); > > typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, > int stride, int log2_denom, int weightd, int weights, int h); > > @@ -46,6 +47,9 @@ typedef struct { > > void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int > idwt_stride, int width, int height/*mod 2*/); > > void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int > stride, const uint8_t *obmc_weight, int 
yblen); > > > > + /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ > > + void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t > stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h); > > + > > dirac_weight_func weight_dirac_pixels_tab[3]; > > dirac_biweight_func biweight_dirac_pixels_tab[3]; > > } DiracDSPContext; > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > > index 9db7b67..f743363 100644 > > --- a/libavcodec/x86/diracdsp.asm > > +++ b/libavcodec/x86/diracdsp.asm > > @@ -289,6 +289,46 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, > stride, obmc, yblen > > RET > > %endm > > > > +%macro DEQUANT_SUBBAND_32 0 > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > const int qf, const int qs, int64_t tot_v, int64_t tot_h) > > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, > tot_h > > Again, x86_64 only as is. > > > + > > + movd m2, qfd > > + movd m3, qsd > > + SPLATD m2 > > + SPLATD m3 > > + neg tot_vq > > + neg tot_hq > > Same as with put_signed_rect_clamped_10, no reason to neg these. > > > + mov r7, dstq > > + mov r8, tot_hq > > You have qf and qs free. There's no need to use two extra registers. > This and changing tot_v and tot_h to int/ptrdiff_t should make it work > on x86_32 without extra work. > > > + > > + .loop_v: > > + mov dstq, r7 > > + mov tot_hq, r8 > > + > > + .loop_h: > > + movu m0, [srcq] > > + > > + pabsd m1, m0 > > + pmulld m1, m2 > > + paddd m1, m3 > > + psrld m1, 2 > > + psignd m1, m0 > > + > > + movu [dstq], m1 > > + > > + add srcq, mmsize > > + add dstq, mmsize > > + add tot_hq, 4 > > + jl .loop_h > > + > > + add r7, strideq > > + add tot_vq, 1 > > + jl .loop_v > > + > > + RET > > +%endm > > + > > INIT_MMX > > %if ARCH_X86_64 == 0 > > PUT_RECT mmx > > @@ -310,3 +350,4 @@ ADD_OBMC 16, sse2 > > > > INIT_XMM sse4 > > PUT_RECT_10 > > +DEQUANT_SUBBAND_32 > > No reason to make it a macro. It's a single function. 
> > > diff --git a/libavcodec/x86/diracdsp_init.c > b/libavcodec/x86/diracdsp_init.c > > index 4786eea..8541eb3 100644 > > --- a/libavcodec/x86/diracdsp_init.c > > +++ b/libavcodec/x86/diracdsp_init.c > > @@ -45,9 +45,10 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int > dst_stride, const int16_t *src, i > > void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const > int16_t *src, int src_stride, int width, int height); > > void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const > int16_t *src, int src_stride, int width, int height); > > void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, > const int16_t *src, int src_stride, int width, int height); > > - > > void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, > const uint8_t *src, int src_stride, int width, int height); > > > > +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t > stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h); > > + > > #if HAVE_YASM > > > > #define HPEL_FILTER(MMSIZE, EXT) > \ > > @@ -188,6 +189,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) > > } > > > > if (EXTERNAL_SSE4(mm_flags)) { > > + c->dequant_subband[1] = ff_dequant_subband_32_sse4; > > c->put_signed_rect_clamped[1] = > ff_put_signed_rect_clamped_10_sse4; > > } > > } > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
Thanks for the suggestions, I'm attaching an amended patch. It is still x86_64-only, but that can always be changed later when I have the time.
From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001 From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com> Date: Thu, 23 Jun 2016 18:06:56 +0100 Subject: [PATCH 2/2] diracdsp: add dequantization SIMD Currently unused, to be used in the following commits. Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> --- libavcodec/diracdsp.c | 24 ++++++++++++++++++++++++ libavcodec/diracdsp.h | 4 ++++ libavcodec/x86/diracdsp.asm | 36 ++++++++++++++++++++++++++++++++++++ libavcodec/x86/diracdsp_init.c | 2 ++ 4 files changed, 66 insertions(+) diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c index ab8d149..cd1209e 100644 --- a/libavcodec/diracdsp.c +++ b/libavcodec/diracdsp.c @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride, } } +#define DEQUANT_SUBBAND(PX) \ +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t stride, \ + const int qf, const int qs, int tot_v, int tot_h) \ +{ \ + int i, y; \ + for (y = 0; y < tot_v; y++) { \ + PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; \ + for (i = 0; i < tot_h; i++) { \ + c = *src_r++; \ + sign = FFSIGN(c)*(!!c); \ + c = (FFABS(c)*qf + qs) >> 2; \ + *dst_r++ = c*sign; \ + } \ + src += tot_h << (sizeof(PX) >> 1); \ + dst += stride; \ + } \ +} + +DEQUANT_SUBBAND(int16_t) +DEQUANT_SUBBAND(int32_t) + #define PIXFUNC(PFX, WIDTH) \ c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \ c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \ @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; + c->dequant_subband[0] = c->dequant_subband[2] = dequant_subband_int16_t_c; + c->dequant_subband[1] = c->dequant_subband[3] = dequant_subband_int32_t_c; + PIXFUNC(put, 8); PIXFUNC(put, 16); PIXFUNC(put, 32); diff --git 
a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h index 25a872d..224828d 100644 --- a/libavcodec/diracdsp.h +++ b/libavcodec/diracdsp.h @@ -22,6 +22,7 @@ #define AVCODEC_DIRACDSP_H #include <stdint.h> +#include <stddef.h> typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h); typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h); @@ -46,6 +47,9 @@ typedef struct { void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/); void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); + /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ + void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h); + dirac_weight_func weight_dirac_pixels_tab[3]; dirac_biweight_func biweight_dirac_pixels_tab[3]; } DiracDSPContext; diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm index a0d6788..a764706 100644 --- a/libavcodec/x86/diracdsp.asm +++ b/libavcodec/x86/diracdsp.asm @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w RET +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h) +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, tot_h + + movd m2, qfd + movd m3, qsd + SPLATD m2 + SPLATD m3 + mov r7, dstq + mov r8, tot_hq + + .loop_v: + mov dstq, r7 + mov tot_hq, r8 + + .loop_h: + movu m0, [srcq] + + pabsd m1, m0 + pmulld m1, m2 + paddd m1, m3 + psrld m1, 2 + psignd m1, m0 + + movu [dstq], m1 + + add srcq, mmsize + add dstq, mmsize + sub tot_hq, 4 + jl .loop_h + + add r7, strideq + sub tot_vq, 1 + jl .loop_v + + RET + %endif diff --git 
a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c index 7fa554e..a1bab9c 100644 --- a/libavcodec/x86/diracdsp_init.c +++ b/libavcodec/x86/diracdsp_init.c @@ -48,6 +48,7 @@ void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t #if ARCH_X86_64 void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height); +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h); #endif #if HAVE_YASM @@ -191,6 +192,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) #if ARCH_X86_64 if (EXTERNAL_SSE4(mm_flags)) { + c->dequant_subband[1] = ff_dequant_subband_32_sse4; c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4; } #endif -- 2.8.1.369.geae769a
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel