On Mon, Jun 27, 2016 at 12:53:47PM +0100, Rostislav Pehlivanov wrote: > On 24 June 2016 at 16:38, James Almer <jamr...@gmail.com> wrote: > > > On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote: > > > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001 > > > From: Rostislav Pehlivanov <rpehliva...@ob-encoder.com> > > > Date: Thu, 23 Jun 2016 18:06:56 +0100 > > > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD > > > > > > Currently unused, to be used in the following commits. > > > > > > Signed-off-by: Rostislav Pehlivanov <rpehliva...@obe.tv> > > > --- > > > libavcodec/diracdsp.c | 24 ++++++++++++++++++++++++ > > > libavcodec/diracdsp.h | 4 ++++ > > > libavcodec/x86/diracdsp.asm | 36 ++++++++++++++++++++++++++++++++++++ > > > libavcodec/x86/diracdsp_init.c | 2 ++ > > > 4 files changed, 66 insertions(+) > > > > > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c > > > index ab8d149..cd1209e 100644 > > > --- a/libavcodec/diracdsp.c > > > +++ b/libavcodec/diracdsp.c > > > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const > > uint16_t *src, int stride, > > > } > > > } > > > > > > +#define DEQUANT_SUBBAND(PX) > > \ > > > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, > > ptrdiff_t stride, \ > > > + const int qf, const int qs, > > int tot_v, int tot_h) \ > > > +{ > > \ > > > + int i, y; > > \ > > > + for (y = 0; y < tot_v; y++) { > > \ > > > + PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst; > > \ > > > + for (i = 0; i < tot_h; i++) { > > \ > > > + c = *src_r++; > > \ > > > + sign = FFSIGN(c)*(!!c); > > \ > > > + c = (FFABS(c)*qf + qs) >> 2; > > \ > > > + *dst_r++ = c*sign; > > \ > > > + } > > \ > > > + src += tot_h << (sizeof(PX) >> 1); > > \ > > > + dst += stride; > > \ > > > + } > > \ > > > +} > > > + > > > +DEQUANT_SUBBAND(int16_t) > > > +DEQUANT_SUBBAND(int32_t) > > > + > > > #define PIXFUNC(PFX, WIDTH) > > \ > > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## > > 
_dirac_pixels ## WIDTH ## _c; \ > > > c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## > > _dirac_pixels ## WIDTH ## _l2_c; \ > > > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c) > > > c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c; > > > c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c; > > > > > > + c->dequant_subband[0] = c->dequant_subband[2] = > > dequant_subband_int16_t_c; > > > + c->dequant_subband[1] = c->dequant_subband[3] = > > dequant_subband_int32_t_c; > > > + > > > PIXFUNC(put, 8); > > > PIXFUNC(put, 16); > > > PIXFUNC(put, 32); > > > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h > > > index 25a872d..224828d 100644 > > > --- a/libavcodec/diracdsp.h > > > +++ b/libavcodec/diracdsp.h > > > @@ -22,6 +22,7 @@ > > > #define AVCODEC_DIRACDSP_H > > > > > > #include <stdint.h> > > > +#include <stddef.h> > > > > > > typedef void (*dirac_weight_func)(uint8_t *block, int stride, int > > log2_denom, int weight, int h); > > > typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, > > int stride, int log2_denom, int weightd, int weights, int h); > > > @@ -46,6 +47,9 @@ typedef struct { > > > void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t > > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int > > idwt_stride, int width, int height/*mod 2*/); > > > void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int > > stride, const uint8_t *obmc_weight, int yblen); > > > > > > + /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */ > > > + void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t > > stride, const int qf, const int qs, int tot_v, int tot_h); > > > + > > > dirac_weight_func weight_dirac_pixels_tab[3]; > > > dirac_biweight_func biweight_dirac_pixels_tab[3]; > > > } DiracDSPContext; > > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm > > > index a0d6788..a764706 100644 > > > --- 
a/libavcodec/x86/diracdsp.asm > > > +++ b/libavcodec/x86/diracdsp.asm > > > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, > > dst_stride, src, src_stride, w > > > > > > RET > > > > > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, > > const int qf, const int qs, int tot_v, int tot_h) > > > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, > > tot_h > > > + > > > + movd m2, qfd > > > + movd m3, qsd > > > + SPLATD m2 > > > + SPLATD m3 > > > + mov r7, dstq > > > + mov r8, tot_hq > > > > Replace every r7 and r8 with r3 and r4, make the cglobal line 7, 7, 4 > > and the function will work on x86_32. > > > > > + > > > + .loop_v: > > > + mov dstq, r7 > > > + mov tot_hq, r8 > > > + > > > + .loop_h: > > > + movu m0, [srcq] > > > + > > > + pabsd m1, m0 > > > + pmulld m1, m2 > > > + paddd m1, m3 > > > + psrld m1, 2 > > > + psignd m1, m0 > > > + > > > + movu [dstq], m1 > > > + > > > + add srcq, mmsize > > > + add dstq, mmsize > > > + sub tot_hq, 4 > > > + jl .loop_h > > > > Jump if greater. Also use tot_hd, or change the prototypes. > > > > > + > > > + add r7, strideq > > > + sub tot_vq, 1 > > > + jl .loop_v > > > > Ditto. 
> > > > > + > > > + RET > > > + > > > %endif > > > diff --git a/libavcodec/x86/diracdsp_init.c > > b/libavcodec/x86/diracdsp_init.c > > > index 7fa554e..a1bab9c 100644 > > > --- a/libavcodec/x86/diracdsp_init.c > > > +++ b/libavcodec/x86/diracdsp_init.c > > > @@ -48,6 +48,7 @@ void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int > > dst_stride, const int16_t > > > > > > #if ARCH_X86_64 > > > void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, > > const uint8_t *src, int src_stride, int width, int height); > > > +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t > > stride, const int qf, const int qs, int tot_v, int tot_h); > > > #endif > > > > > > #if HAVE_YASM > > > @@ -191,6 +192,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c) > > > > > > #if ARCH_X86_64 > > > if (EXTERNAL_SSE4(mm_flags)) { > > > + c->dequant_subband[1] = ff_dequant_subband_32_sse4; > > > c->put_signed_rect_clamped[1] = > > ff_put_signed_rect_clamped_10_sse4; > > > } > > > #endif > > > -- 2.8.1.369.geae769a > > > > _______________________________________________ > > ffmpeg-devel mailing list > > ffmpeg-devel@ffmpeg.org > > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > I've attached another patch which should work fine now. > I did this after the put_signed_rect so it does require the first patch, > but if this patch is okay I'll amend and tidy things before I push. > For some reason changing dstq to be stored at r4 or r3 broke it and I've no > idea why. Neither is used after loading m2 and m3. Should work on x86_32 > now, but I'm wondering why I can't save that register.
on x86_32: YASM libavcodec/x86/diracdsp.o src/libavcodec/x86/diracdsp.asm:279: error: undefined symbol `r7' (first use) src/libavcodec/x86/diracdsp.asm:279: error: (Each undefined symbol is reported only once.) make: *** [libavcodec/x86/diracdsp.o] Error 1 btw you can test x86_32 on x86_64 easily with something like this: ./configure --cc='ccache gcc' --arch=x86_32 --target-os=linux --extra-cflags=-m32 --extra-ldflags=-m32 --enable-cross-compile should be all that's needed [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Those who are best at talking, realize last or never when they are wrong.
signature.asc
Description: Digital signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel