On Wed, 6 Apr 2016 at 19:10 Ronald S. Bultje <rsbul...@gmail.com> wrote:
> --- > libavfilter/colorspacedsp.c | 3 + > libavfilter/colorspacedsp.h | 3 + > libavfilter/x86/Makefile | 2 + > libavfilter/x86/colorspacedsp.asm | 1115 > ++++++++++++++++++++++++++++++++++ > libavfilter/x86/colorspacedsp_init.c | 119 ++++ > tests/checkasm/Makefile | 1 + > tests/checkasm/checkasm.c | 3 + > tests/checkasm/checkasm.h | 1 + > tests/checkasm/vf_colorspace.c | 314 ++++++++++ > 9 files changed, 1561 insertions(+) > create mode 100644 libavfilter/x86/colorspacedsp.asm > create mode 100644 libavfilter/x86/colorspacedsp_init.c > create mode 100644 tests/checkasm/vf_colorspace.c > > diff --git a/libavfilter/colorspacedsp.c b/libavfilter/colorspacedsp.c > index 51a7c1d..d4c43c3 100644 > --- a/libavfilter/colorspacedsp.c > +++ b/libavfilter/colorspacedsp.c > @@ -128,4 +128,7 @@ void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp) > init_yuv2yuv_fns(2, 12); > > dsp->multiply3x3 = multiply3x3_c; > + > + if (ARCH_X86) > + ff_colorspacedsp_x86_init(dsp); > } > diff --git a/libavfilter/colorspacedsp.h b/libavfilter/colorspacedsp.h > index 3571117..4e70c6c 100644 > --- a/libavfilter/colorspacedsp.h > +++ b/libavfilter/colorspacedsp.h > @@ -48,4 +48,7 @@ typedef struct ColorSpaceDSPContext { > > void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp); > > +/* internal */ > +void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp); > + > #endif /* AVFILTER_COLORSPACEDSP_H */ > diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile > index ed294e0..4486b79 100644 > --- a/libavfilter/x86/Makefile > +++ b/libavfilter/x86/Makefile > @@ -1,5 +1,6 @@ > OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o > OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o > +OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o > OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o > OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o > OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o > @@ -23,6 +24,7 @@ OBJS-$(CONFIG_YADIF_FILTER) += > x86/vf_yadif_init.o > > YASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o > YASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o > +YASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o > YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o > YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o > YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o > diff --git a/libavfilter/x86/colorspacedsp.asm > b/libavfilter/x86/colorspacedsp.asm > new file mode 100644 > index 0000000..e536566 > --- /dev/null > +++ b/libavfilter/x86/colorspacedsp.asm > @@ -0,0 +1,1115 @@ > > +;***************************************************************************** > +;* x86-optimized functions for colorspace filter > +;* > +;* Copyright (C) 2016 Ronald S. Bultje <rsbul...@gmail.com> > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +pw_1: times 8 dw 1 > +pw_2: times 8 dw 2 > +pw_4: times 8 dw 4 > +pw_8: times 8 dw 8 > +pw_16: times 8 dw 16 > +pw_64: times 8 dw 64 > +pw_128: times 8 dw 128 > +pw_256: times 8 dw 256 > +pw_512: times 8 dw 512 > +pw_1023: times 8 dw 1023 > +pw_1024: times 8 dw 1024 > +pw_2048: times 8 dw 2048 > +pw_4095: times 8 dw 4095 > +pw_8192: times 8 dw 8192 > +pw_16384: times 8 dw 16384 > + > +pd_1: times 4 dd 1 > +pd_2: times 4 dd 2 > +pd_128: times 4 dd 128 > +pd_512: times 4 dd 512 > +pd_2048: times 4 dd 2048 > +pd_8192: times 4 dd 8192 > +pd_32768: times 4 dd 32768 > +pd_131072: times 4 dd 131072 > > Don't we have these defined somewhere? > +SECTION .text > + > +; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t > yuv_out_stride[3], > +; uint8_t *yuv_in[3], ptrdiff_t > yuv_in_stride[3], > +; int w, int h, const int16_t > yuv2yuv_coeffs[3][3][8], > +; const int16_t yuv_offset[2][8]) > + > +%if ARCH_X86_64 > +%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), > log2_chroma_h (vert) > + > +%assign %%sh (14 + %1 - %2) > +%assign %%rnd (1 << (%%sh - 1)) > +%assign %%uvinoff (128 << (%1 - 8)) > +%assign %%uvoutoff (128 << (%2 - 8)) > +%if %3 == 0 > +%assign %%ss 444 > +%elif %4 == 0 > +%assign %%ss 422 > +%else ; %4 == 1 > +%assign %%ss 420 > +%endif ; %3/%4 > +%if %2 != 8 > +%assign %%maxval (1 << %2) - 1 > +%endif ; %2 != 8 > + > +%assign %%ypsh %%sh - 1 > +%if %%ypsh > 14 > +%assign %%yoffsh %%ypsh - 13 > +%assign %%ypsh 14 > +%else > +%assign %%yoffsh 1 > +%endif > +%assign %%yprnd (1 << (%%yoffsh - 1)) > +%assign %%ypmul (1 << %%ypsh) > + > +cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \ > + yo, yos, yi, yis, w, h, c, yoff, ui, > vi, uo, vo > +%if %3 == 1 > + inc wd > + sar wd, 1 > +%if %4 == 1 > + inc hd > + sar hd, 1 > +%endif ; %4 == 1 > +%endif ; %3 == 1 > + mov [rsp+3*mmsize+0], wd > + mov [rsp+3*mmsize+4], hd > + > + mova m10, [cq] > + pxor m11, m11 > + mova m12, [pd_ %+ %%uvoutoff] > + pslld m12, %%sh > + paddd m12, [pd_ %+ %%rnd] > + mova m13, [pw_ %+ %%uvinoff] > + mova m14, [yoffq+ 0] ; y_off_in > + mova m15, [yoffq+16] ; y_off_out > +%if %%yoffsh != 0 > + psllw m15, %%yoffsh > +%endif > + paddw m15, [pw_ %+ %%yprnd] > + punpcklwd m10, m15 > + mova m15, [pw_ %+ %%ypmul] > + movh m0, [cq+1*16] ; cyu > + movh m1, [cq+2*16] ; cyv > + movh m2, [cq+4*16] ; cuu > + movh m3, [cq+5*16] ; cuv > + movh m4, [cq+7*16] ; cvu > + movh m5, [cq+8*16] ; cvv > + punpcklwd m0, m1 > + punpcklwd m2, m3 > + punpcklwd m4, m5 > + mova [rsp+0*mmsize], m0 > + mova [rsp+1*mmsize], m2 > + mova [rsp+2*mmsize], m4 > + > + DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, > tmp > + > + mov uiq, [yiq+gprsize*1] > + mov viq, [yiq+gprsize*2] > + mov yiq, [yiq+gprsize*0] > + mov uoq, [yoq+gprsize*1] > + mov voq, [yoq+gprsize*2] > + mov yoq, [yoq+gprsize*0] > + mov uisq, [yisq+gprsize*1] > + mov visq, [yisq+gprsize*2] > + mov yisq, [yisq+gprsize*0] > + mov uosq, [yosq+gprsize*1] > + mov vosq, [yosq+gprsize*2] > + mov yosq, [yosq+gprsize*0] > + > +.loop_v: > + xor xq, xq > + > +.loop_h: > +%if %4 == 1 > + lea tmpq, [yiq+yisq] > +%endif ; %4 == 1 > +%if %1 == 8 > + movu m0, [yiq+xq*(1<<%3)] ; y00/01 > +%if %4 == 1 > + movu m2, [tmpq+xq*2] ; y10/11 > +%endif ; %4 == 1 > +%if %3 == 1 > + movh m4, [uiq+xq] ; u > + movh m5, [viq+xq] ; v > +%else ; %3 != 1 > + movu m4, [uiq+xq] ; u > + movu m5, [viq+xq] ; v > +%endif ; %3 ==/!= 1 > + punpckhbw m1, m0, m11 > + punpcklbw m0, m11 > +%if %4 == 1 > + punpckhbw m3, m2, m11 > + punpcklbw m2, m11 > +%endif ; %4 == 1 > +%if %3 == 0 > + punpckhbw m2, m4, m11 > + punpckhbw m3, m5, m11 > +%endif ; %3 == 0 > + punpcklbw m4, m11 > + punpcklbw m5, m11 > +%else ; %1 != 8 > + movu m0, [yiq+xq*(2<<%3)] ; y00/01 > + movu m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01 > +%if %4 == 1 > + movu m2, [tmpq+xq*4] ; y10/11 > + movu m3, [tmpq+xq*4+mmsize] ; y10/11 > +%endif ; %4 == 1 > + movu m4, [uiq+xq*2] ; u > + movu m5, [viq+xq*2] ; v > +%if %3 == 0 > + movu m2, [uiq+xq*2+mmsize] > + movu m3, [viq+xq*2+mmsize] > +%endif ; %3 == 0 > +%endif ; %1 ==/!= 8 > + psubw m0, m14 > + psubw m1, m14 > +%if %4 == 1 > + psubw m2, m14 > + psubw m3, m14 > +%endif ; %4 == 1 > + psubw m4, m13 > + psubw m5, m13 > +%if %3 == 0 > + psubw m2, m13 > + psubw m3, m13 > +%endif ; %3 == 0 > + > + SBUTTERFLY wd, 4, 5, 6 > + pmaddwd m6, m4, [rsp+1*mmsize] > + pmaddwd m7, m5, [rsp+1*mmsize] > +%if %3 == 0 > + SBUTTERFLY wd, 2, 3, 8 > + pmaddwd m8, m2, [rsp+1*mmsize] > + pmaddwd m9, m3, [rsp+1*mmsize] > +%else ; %3 != 0 > + pmaddwd m8, m4, [rsp+2*mmsize] > + pmaddwd m9, m5, [rsp+2*mmsize] > +%endif > + paddd m6, m12 > + paddd m7, m12 > + paddd m8, m12 > + paddd m9, m12 > + psrad m6, %%sh > + psrad m7, %%sh > + psrad m8, %%sh > + psrad m9, %%sh > + packssdw m6, m7 > + packssdw m8, m9 > +%if %2 == 8 > + packuswb m6, m8 > +%if %3 == 0 > + movu [uoq+xq], m6 > +%else ; %3 != 0 > + movh [uoq+xq], m6 > + movhps [voq+xq], m6 > +%endif ; %3 ==/!= 0 > +%else ; %2 != 8 > + pmaxsw m6, m11 > + pmaxsw m8, m11 > + pminsw m6, [pw_ %+ %%maxval] > + pminsw m8, [pw_ %+ %%maxval] > CLIPW > + movu [uoq+xq*2], m6 > +%if %3 == 0 > + movu [uoq+xq*2+mmsize], m8 > +%else ; %3 != 0 > + movu [voq+xq*2], m8 > +%endif ; %3 ==/!= 0 > +%endif ; %2 ==/!= 8 > + > +%if %3 == 0 > + pmaddwd m6, m4, [rsp+2*mmsize] > + pmaddwd m7, m5, [rsp+2*mmsize] > + pmaddwd m8, m2, [rsp+2*mmsize] > + pmaddwd m9, m3, [rsp+2*mmsize] > + paddd m6, m12 > + paddd m7, m12 > + paddd m8, m12 > + paddd m9, m12 > + psrad m6, %%sh > + psrad m7, %%sh > + psrad m8, %%sh > + psrad m9, %%sh > + packssdw m6, m7 > + packssdw m8, m9 > +%if %2 == 8 > + packuswb m6, m8 > + movu [voq+xq], m6 > +%else ; %2 != 8 > + pmaxsw m6, m11 > + pmaxsw m8, m11 > + pminsw m6, [pw_ %+ %%maxval] > + pminsw m8, [pw_ %+ %%maxval] > CLIPW (and a few other places) > + movu [voq+xq*2], m6 > + movu [voq+xq*2+mmsize], m8 > +%endif ; %2 ==/!= 8 > +%endif ; %3 == 0 > + > + pmaddwd m4, [rsp+0*mmsize] > + pmaddwd m5, [rsp+0*mmsize] ; uv_val > +%if %3 == 0 > + pmaddwd m2, [rsp+0*mmsize] > + pmaddwd m3, [rsp+0*mmsize] > +%endif ; %3 == 0 > + > + ; unpack y pixels with m15 (shifted round + offset), then multiply > + ; by m10, add uv pixels, and we're done! > +%if %3 == 1 > + punpckhdq m8, m4, m4 > + punpckldq m4, m4 > + punpckhdq m9, m5, m5 > + punpckldq m5, m5 > +%else ; %3 != 1 > + SWAP 8, 5, 2 > + SWAP 3, 9 > +%endif ; %3 ==/!= 1 > +%if %4 == 1 > + punpckhwd m6, m2, m15 > + punpcklwd m2, m15 > + punpckhwd m7, m3, m15 > + punpcklwd m3, m15 > + pmaddwd m2, m10 > + pmaddwd m6, m10 > + pmaddwd m3, m10 > + pmaddwd m7, m10 > + paddd m2, m4 > + paddd m6, m8 > + paddd m3, m5 > + paddd m7, m9 > + psrad m2, %%sh > + psrad m6, %%sh > + psrad m3, %%sh > + psrad m7, %%sh > + packssdw m2, m6 > + packssdw m3, m7 > + > + lea tmpq, [yoq+yosq] > +%if %2 == 8 > + packuswb m2, m3 > + movu [tmpq+xq*2], m2 > +%else ; %2 != 8 > + pmaxsw m2, m11 > + pmaxsw m3, m11 > + pminsw m2, [pw_ %+ %%maxval] > + pminsw m3, [pw_ %+ %%maxval] > + movu [tmpq+xq*4], m2 > + movu [tmpq+xq*4+mmsize], m3 > +%endif ; %2 ==/!= 8 > +%endif ; %4 == 1 > + > + punpckhwd m6, m0, m15 > + punpcklwd m0, m15 > + punpckhwd m7, m1, m15 > + punpcklwd m1, m15 > + pmaddwd m0, m10 > + pmaddwd m6, m10 > + pmaddwd m1, m10 > + pmaddwd m7, m10 > + paddd m0, m4 > + paddd m6, m8 > + paddd m1, m5 > + paddd m7, m9 > + psrad m0, %%sh > + psrad m6, %%sh > + psrad m1, %%sh > + psrad m7, %%sh > + packssdw m0, m6 > + packssdw m1, m7 > + > +%if %2 == 8 > + packuswb m0, m1 > + movu [yoq+xq*(1<<%3)], m0 > +%else ; %2 != 8 > + pmaxsw m0, m11 > + pmaxsw m1, m11 > + pminsw m0, [pw_ %+ %%maxval] > + pminsw m1, [pw_ %+ %%maxval] > + movu [yoq+xq*(2<<%3)], m0 > + movu [yoq+xq*(2<<%3)+mmsize], m1 > +%endif ; %2 ==/!= 8 > + > + add xq, mmsize >> %3 > + cmp xd, dword [rsp+3*mmsize+0] > + jl .loop_h > + > +%if %4 == 1 > + lea yiq, [yiq+yisq*2] > + lea yoq, [yoq+yosq*2] > +%else ; %4 != 1 > + add yiq, yisq > + add yoq, yosq > +%endif ; %4 ==/!= 1 > + add uiq, uisq > + add viq, visq > + add uoq, uosq > + add voq, vosq > + dec dword [rsp+3*mmsize+4] > + jg .loop_v > + > + RET > +%endmacro > + > +%macro YUV2YUV_FNS 2 ; ss_w, ss_h > +YUV2YUV_FN 8, 8, %1, %2 > +YUV2YUV_FN 10, 8, %1, %2 > +YUV2YUV_FN 12, 8, %1, %2 > +YUV2YUV_FN 8, 10, %1, %2 > +YUV2YUV_FN 10, 10, %1, %2 > +YUV2YUV_FN 12, 10, %1, %2 > +YUV2YUV_FN 8, 12, %1, %2 > +YUV2YUV_FN 10, 12, %1, %2 > +YUV2YUV_FN 12, 12, %1, %2 > +%endmacro > + > +INIT_XMM sse2 > +YUV2YUV_FNS 0, 0 > +YUV2YUV_FNS 1, 0 > +YUV2YUV_FNS 1, 1 > + > +; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride, > +; uint8_t *yuv[3], ptrdiff_t yuv_stride[3], > +; int w, int h, const int16_t > yuv2rgb_coeffs[3][3][8], > +; const int16_t yuv_offset[8]) > +%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert) > +%assign %%sh (%1 - 1) > +%assign %%rnd (1 << (%%sh - 1)) > +%assign %%uvoff (1 << (%1 - 1)) > +%if %2 == 0 > +%assign %%ss 444 > +%elif %3 == 0 > +%assign %%ss 422 > +%else ; %3 == 1 > +%assign %%ss 420 > +%endif ; %2/%3 > + > +cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \ > + rgb, rgbs, yuv, yuvs, ww, h, c, yoff > +%if %2 == 1 > + inc wwd > + sar wwd, 1 > +%endif ; %2 == 1 > +%if %3 == 1 > + inc hd > + sar hd, 1 > +%endif ; %3 == 1 > + pxor m11, m11 > + mova m15, [yoffq] ; yoff > + movh m14, [cq+ 0] ; cy > + movh m10, [cq+ 32] ; crv > + movh m13, [cq+112] ; cbu > + movh m12, [cq+ 64] ; cgu > + movh m9, [cq+ 80] ; cgv > + punpcklwd m14, [pw_ %+ %%rnd] ; cy, rnd > + punpcklwd m13, m11 ; cbu, 0 > + punpcklwd m11, m10 ; 0, crv > + punpcklwd m12, m9 ; cgu, cgv > + mova [rsp+0*mmsize], m11 > + mova [rsp+1*mmsize], m12 > + mova [rsp+2*mmsize], m13 > + mova [rsp+3*mmsize], m14 > + pxor m14, m14 > + > + DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp > + > + mov gq, [rq+1*gprsize] > + mov bq, [rq+2*gprsize] > + mov rq, [rq+0*gprsize] > + mov uq, [yq+1*gprsize] > + mov vq, [yq+2*gprsize] > + mov yq, [yq+0*gprsize] > + mov usq, [ysq+1*gprsize] > + mov vsq, [ysq+2*gprsize] > + mov ysq, [ysq+0*gprsize] > + > +.loop_v: > + xor xq, xq > + > +.loop_h: > +%if %3 == 1 > + lea tmpq, [yq+ysq] > +%endif ; %3 == 1 > +%if %1 == 8 > + movu m0, [yq+xq*(1<<%2)] > +%if %3 == 1 > + movu m2, [tmpq+xq*2] > +%endif ; %3 == 1 > +%if %2 == 1 > + movh m4, [uq+xq] > + movh m5, [vq+xq] > +%else ; %2 != 1 > + movu m4, [uq+xq] > + movu m5, [vq+xq] > +%endif ; %2 ==/!= 1 > + punpckhbw m1, m0, m14 > + punpcklbw m0, m14 > +%if %3 == 1 > + punpckhbw m3, m2, m14 > + punpcklbw m2, m14 > +%endif ; %3 == 1 > +%if %2 == 0 > + punpckhbw m2, m4, m14 > + punpckhbw m3, m5, m14 > +%endif ; %2 == 0 > + punpcklbw m4, m14 > + punpcklbw m5, m14 > +%else ; %1 != 8 > + movu m0, [yq+xq*(2<<%2)] > + movu m1, [yq+xq*(2<<%2)+mmsize] > +%if %3 == 1 > + movu m2, [tmpq+xq*4] > + movu m3, [tmpq+xq*4+mmsize] > +%endif ; %3 == 1 > + movu m4, [uq+xq*2] > + movu m5, [vq+xq*2] > +%if %2 == 0 > + movu m2, [uq+xq*2+mmsize] > + movu m3, [vq+xq*2+mmsize] > +%endif ; %2 == 0 > +%endif ; %1 ==/!= 8 > + psubw m0, m15 > + psubw m1, m15 > +%if %3 == 1 > + psubw m2, m15 > + psubw m3, m15 > +%endif ; %3 == 1 > + psubw m4, [pw_ %+ %%uvoff] > + psubw m5, [pw_ %+ %%uvoff] > + SBUTTERFLY wd, 4, 5, 6 > +%if %2 == 0 > + psubw m2, [pw_ %+ %%uvoff] > + psubw m3, [pw_ %+ %%uvoff] > + SBUTTERFLY wd, 2, 3, 6 > +%endif ; %2 == 0 > + > + ; calculate y+rnd full-resolution [0-3,6-9] > + punpckhwd m6, m0, [pw_1] ; y, 1 > + punpcklwd m0, [pw_1] ; y, 1 > + punpckhwd m7, m1, [pw_1] ; y, 1 > + punpcklwd m1, [pw_1] ; y, 1 > + pmaddwd m0, [rsp+3*mmsize] > + pmaddwd m6, [rsp+3*mmsize] > + pmaddwd m1, [rsp+3*mmsize] > + pmaddwd m7, [rsp+3*mmsize] > +%if %3 == 1 > + punpckhwd m8, m2, [pw_1] ; y, 1 > + punpcklwd m2, [pw_1] ; y, 1 > + punpckhwd m9, m3, [pw_1] ; y, 1 > + punpcklwd m3, [pw_1] ; y, 1 > + pmaddwd m2, [rsp+3*mmsize] > + pmaddwd m8, [rsp+3*mmsize] > + pmaddwd m3, [rsp+3*mmsize] > + pmaddwd m9, [rsp+3*mmsize] > + mova [rsp+4*mmsize], m2 > + mova [rsp+5*mmsize], m8 > + mova [rsp+6*mmsize], m3 > + mova [rsp+7*mmsize], m9 > +%endif ; %3 == 1 > + > + ; calculate r offsets (un-subsampled, then duplicate) > + pmaddwd m10, m4, [rsp+0*mmsize] > +%if %2 == 1 > + pmaddwd m12, m5, [rsp+0*mmsize] > + punpckhdq m11, m10, m10 > + punpckldq m10, m10 > + punpckhdq m13, m12, m12 > + punpckldq m12, m12 > +%else ; %2 != 1 > + pmaddwd m11, m5, [rsp+0*mmsize] > + pmaddwd m12, m2, [rsp+0*mmsize] > + pmaddwd m13, m3, [rsp+0*mmsize] > +%endif ; %2 ==/!= 1 > +%if %3 == 1 > + paddd m2, m10, [rsp+4*mmsize] > + paddd m3, m11, [rsp+5*mmsize] > + paddd m8, m12, [rsp+6*mmsize] > + paddd m9, m13, [rsp+7*mmsize] > +%endif > + paddd m10, m0 > + paddd m11, m6 > + paddd m12, m1 > + paddd m13, m7 > +%if %3 == 1 > + psrad m2, %%sh > + psrad m3, %%sh > + psrad m8, %%sh > + psrad m9, %%sh > +%endif ; %3 == 1 > + psrad m10, %%sh > + psrad m11, %%sh > + psrad m12, %%sh > + psrad m13, %%sh > +%if %3 == 1 > + lea tmpq, [rq+rgbsq*2] > + packssdw m2, m3 > + packssdw m8, m9 > + mova [tmpq+xq*4], m2 > + mova [tmpq+xq*4+mmsize], m8 > +%endif ; %3 == 1 > + packssdw m10, m11 > + packssdw m12, m13 > + mova [rq+xq*(2 << %2)], m10 > + mova [rq+xq*(2 << %2)+mmsize], m12 > + > + ; calculate g offsets (un-subsampled, then duplicate) > + pmaddwd m10, m4, [rsp+1*mmsize] > +%if %2 == 1 > + pmaddwd m12, m5, [rsp+1*mmsize] > + punpckhdq m11, m10, m10 > + punpckldq m10, m10 > + punpckhdq m13, m12, m12 > + punpckldq m12, m12 > +%else ; %2 != 1 > + pmaddwd m11, m5, [rsp+1*mmsize] > + pmaddwd m12, m2, [rsp+1*mmsize] > + pmaddwd m13, m3, [rsp+1*mmsize] > +%endif ; %2 ==/!= 1 > +%if %3 == 1 > + paddd m2, m10, [rsp+4*mmsize] > + paddd m3, m11, [rsp+5*mmsize] > + paddd m8, m12, [rsp+6*mmsize] > + paddd m9, m13, [rsp+7*mmsize] > +%endif ; %3 == 1 > + paddd m10, m0 > + paddd m11, m6 > + paddd m12, m1 > + paddd m13, m7 > +%if %3 == 1 > + psrad m2, %%sh > + psrad m3, %%sh > + psrad m8, %%sh > + psrad m9, %%sh > +%endif ; %3 == 1 > + psrad m10, %%sh > + psrad m11, %%sh > + psrad m12, %%sh > + psrad m13, %%sh > +%if %3 == 1 > + lea tmpq, [gq+rgbsq*2] > + packssdw m2, m3 > + packssdw m8, m9 > + mova [tmpq+xq*4], m2 > + mova [tmpq+xq*4+mmsize], m8 > +%endif ; %3 == 1 > + packssdw m10, m11 > + packssdw m12, m13 > + mova [gq+xq*(2 << %2)], m10 > + mova [gq+xq*(2 << %2)+mmsize], m12 > + > + ; calculate b offsets (un-subsampled, then duplicate) > + pmaddwd m4, [rsp+2*mmsize] > + pmaddwd m5, [rsp+2*mmsize] > +%if %2 == 1 > + punpckhdq m2, m4, m4 > + punpckldq m4, m4 > + punpckhdq m3, m5, m5 > + punpckldq m5, m5 > +%else ; %2 != 1 > + pmaddwd m2, [rsp+2*mmsize] > + pmaddwd m3, [rsp+2*mmsize] > + SWAP 2, 5 > +%endif ; %2 ==/!= 1 > + paddd m0, m4 > + paddd m6, m2 > + paddd m1, m5 > + paddd m7, m3 > +%if %3 == 1 > + paddd m4, [rsp+4*mmsize] > + paddd m2, [rsp+5*mmsize] > + paddd m5, [rsp+6*mmsize] > + paddd m3, [rsp+7*mmsize] > +%endif ; %3 == 1 > + psrad m0, %%sh > + psrad m6, %%sh > + psrad m1, %%sh > + psrad m7, %%sh > +%if %3 == 1 > + psrad m4, %%sh > + psrad m2, %%sh > + psrad m5, %%sh > + psrad m3, %%sh > +%endif ; %3 == 1 > + packssdw m0, m6 > + packssdw m1, m7 > + movu [bq+xq*(2 << %2)], m0 > + movu [bq+xq*(2 << %2)+mmsize], m1 > +%if %3 == 1 > + lea tmpq, [bq+rgbsq*2] > + packssdw m4, m2 > + packssdw m5, m3 > + movu [tmpq+xq*4], m4 > + movu [tmpq+xq*4+mmsize], m5 > +%endif ; %3 == 1 > + > + add xd, mmsize >> %2 > + cmp xd, wwd > + jl .loop_h > + > + lea rq, [rq+rgbsq*(2 << %3)] > + lea gq, [gq+rgbsq*(2 << %3)] > + lea bq, [bq+rgbsq*(2 << %3)] > +%if %3 == 1 > + lea yq, [yq+ysq*2] > +%else ; %3 != 0 > + add yq, ysq > +%endif ; %3 ==/!= 1 > + add uq, usq > + add vq, vsq > + dec hd > + jg .loop_v > + > + RET > +%endmacro > + > +%macro YUV2RGB_FNS 2 > +YUV2RGB_FN 8, %1, %2 > +YUV2RGB_FN 10, %1, %2 > +YUV2RGB_FN 12, %1, %2 > +%endmacro > + > +INIT_XMM sse2 > +YUV2RGB_FNS 0, 0 > +YUV2RGB_FNS 1, 0 > +YUV2RGB_FNS 1, 1 > + > +%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert) > +%assign %%sh 29 - %1 > +%assign %%rnd (1 << (%%sh - 15)) > +%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14)) > +%if %1 != 8 > +%assign %%maxval ((1 << %1) - 1) > +%endif ; %1 != 8 > +%if %2 == 0 > +%assign %%ss 444 > +%elif %3 == 0 > +%assign %%ss 422 > +%else ; %3 == 1 > +%assign %%ss 420 > +%endif ; %2/%3 > + > +cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \ > + yuv, yuvs, rgb, rgbs, ww, h, c, off > +%if %2 == 1 > + inc wwd > + sar wwd, 1 > +%endif ; %2 == 1 > +%if %3 == 1 > + inc hd > + sar hd, 1 > +%endif ; %3 == 1 > + > + ; prepare coeffs > + movh m8, [offq] > + movh m9, [pw_ %+ %%uvrnd] > + psllw m8, %%sh - 14 > + paddw m9, [pw_ %+ %%rnd] > + paddw m8, [pw_ %+ %%rnd] > + movh m0, [cq+ 0] > + movh m1, [cq+ 16] > + movh m2, [cq+ 32] > + movh m3, [cq+ 48] > + movh m4, [cq+ 64] > + movh m5, [cq+ 80] > + movh m6, [cq+112] > + movh m7, [cq+128] > + punpcklwd m0, m1 > + punpcklwd m2, m8 > + punpcklwd m3, m4 > + punpcklwd m4, m5, m9 > + punpcklwd m5, m6 > + punpcklwd m7, m9 > + > + mova [rsp+0*mmsize], m0 ; cry, cgy > + mova [rsp+1*mmsize], m2 ; cby, off + rnd > + mova [rsp+2*mmsize], m3 ; cru, cgu > + mova [rsp+3*mmsize], m4 ; cburv, uvoff + rnd > + mova [rsp+4*mmsize], m5 ; cburv, cgv > + mova [rsp+5*mmsize], m7 ; cbv, uvoff + rnd > + > + > + DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x > + mov gq, [rq+gprsize*1] > + mov bq, [rq+gprsize*2] > + mov rq, [rq+gprsize*0] > + mov uq, [yq+gprsize*1] > + mov vq, [yq+gprsize*2] > + mov yq, [yq+gprsize*0] > + mov usq, [ysq+gprsize*1] > + mov vsq, [ysq+gprsize*2] > + mov ysq, [ysq+gprsize*0] > + > + pxor m15, m15 > +.loop_v: > + xor xd, xd > + > +.loop_h: > + ; top line y > + mova m0, [rq+xq*(2<<%2)] > + mova m3, [rq+xq*(2<<%2)+mmsize] > + mova m1, [gq+xq*(2<<%2)] > + mova m4, [gq+xq*(2<<%2)+mmsize] > + mova m2, [bq+xq*(2<<%2)] > + mova m5, [bq+xq*(2<<%2)+mmsize] > + > + punpcklwd m6, m0, m1 > + punpckhwd m7, m0, m1 > + punpcklwd m8, m3, m4 > + punpckhwd m9, m3, m4 > + punpcklwd m10, m2, [pw_16384] > + punpckhwd m11, m2, [pw_16384] > + punpcklwd m12, m5, [pw_16384] > + punpckhwd m13, m5, [pw_16384] > + > + pmaddwd m6, [rsp+0*mmsize] > + pmaddwd m7, [rsp+0*mmsize] > + pmaddwd m8, [rsp+0*mmsize] > + pmaddwd m9, [rsp+0*mmsize] > + pmaddwd m10, [rsp+1*mmsize] > + pmaddwd m11, [rsp+1*mmsize] > + pmaddwd m12, [rsp+1*mmsize] > + pmaddwd m13, [rsp+1*mmsize] > + paddd m6, m10 > + paddd m7, m11 > + paddd m8, m12 > + paddd m9, m13 > + psrad m6, %%sh > + psrad m7, %%sh > + psrad m8, %%sh > + psrad m9, %%sh > + packssdw m6, m7 > + packssdw m8, m9 > +%if %1 == 8 > + packuswb m6, m8 > + movu [yq+xq*(1<<%2)], m6 > +%else > + pminsw m6, [pw_ %+ %%maxval] > + pminsw m8, [pw_ %+ %%maxval] > + pmaxsw m6, m15 > + pmaxsw m8, m15 > + movu [yq+xq*(2<<%2)], m6 > + movu [yq+xq*(2<<%2)+mmsize], m8 > +%endif > + > +%if %2 == 1 > + ; subsampling cached data > + pmaddwd m0, [pw_1] > + pmaddwd m1, [pw_1] > + pmaddwd m2, [pw_1] > + pmaddwd m3, [pw_1] > + pmaddwd m4, [pw_1] > + pmaddwd m5, [pw_1] > + > +%if %3 == 1 > + ; bottom line y, r/g portion only > + lea tmpq, [rgbsq+xq*2] > + mova m6, [rq+tmpq*2] > + mova m9, [rq+tmpq*2+mmsize] > + mova m7, [gq+tmpq*2] > + mova m10, [gq+tmpq*2+mmsize] > + mova m8, [bq+tmpq*2] > + mova m11, [bq+tmpq*2+mmsize] > + > + punpcklwd m12, m6, m7 > + punpckhwd m13, m6, m7 > + punpcklwd m14, m9, m10 > + punpckhwd m15, m9, m10 > + > + ; release two more registers > + pmaddwd m6, [pw_1] > + pmaddwd m7, [pw_1] > + pmaddwd m9, [pw_1] > + pmaddwd m10, [pw_1] > + paddd m0, m6 > + paddd m3, m9 > + paddd m1, m7 > + paddd m4, m10 > + > + ; bottom line y, b/rnd portion only > + punpcklwd m6, m8, [pw_16384] > + punpckhwd m7, m8, [pw_16384] > + punpcklwd m9, m11, [pw_16384] > + punpckhwd m10, m11, [pw_16384] > + > + pmaddwd m12, [rsp+0*mmsize] > + pmaddwd m13, [rsp+0*mmsize] > + pmaddwd m14, [rsp+0*mmsize] > + pmaddwd m15, [rsp+0*mmsize] > + pmaddwd m6, [rsp+1*mmsize] > + pmaddwd m7, [rsp+1*mmsize] > + pmaddwd m9, [rsp+1*mmsize] > + pmaddwd m10, [rsp+1*mmsize] > + paddd m12, m6 > + paddd m13, m7 > + paddd m14, m9 > + paddd m15, m10 > + psrad m12, %%sh > + psrad m13, %%sh > + psrad m14, %%sh > + psrad m15, %%sh > + packssdw m12, m13 > + packssdw m14, m15 > + lea tmpq, [yq+ysq] > +%if %1 == 8 > + packuswb m12, m14 > + movu [tmpq+xq*2], m12 > +%else > + pxor m15, m15 > + pminsw m12, [pw_ %+ %%maxval] > + pminsw m14, [pw_ %+ %%maxval] > + pmaxsw m12, m15 > + pmaxsw m14, m15 > + movu [tmpq+xq*4], m12 > + movu [tmpq+xq*4+mmsize], m14 > +%endif > + > + ; complete subsampling of r/g/b pixels for u/v > + pmaddwd m8, [pw_1] > + pmaddwd m11, [pw_1] > + paddd m2, m8 > + paddd m5, m11 > + paddd m0, [pd_2] > + paddd m1, [pd_2] > + paddd m2, [pd_2] > + paddd m3, [pd_2] > + paddd m4, [pd_2] > + paddd m5, [pd_2] > + psrad m0, 2 > + psrad m1, 2 > + psrad m2, 2 > + psrad m3, 2 > + psrad m4, 2 > + psrad m5, 2 > +%else ; %3 != 1 > + paddd m0, [pd_1] > + paddd m1, [pd_1] > + paddd m2, [pd_1] > + paddd m3, [pd_1] > + paddd m4, [pd_1] > + paddd m5, [pd_1] > + psrad m0, 1 > + psrad m1, 1 > + psrad m2, 1 > + psrad m3, 1 > + psrad m4, 1 > + psrad m5, 1 > +%endif ; %3 ==/!= 1 > + packssdw m0, m3 > + packssdw m1, m4 > + packssdw m2, m5 > +%endif ; %2 == 1 > + > + ; convert u/v pixels > + SBUTTERFLY wd, 0, 1, 6 > + punpckhwd m6, m2, [pw_16384] > + punpcklwd m2, [pw_16384] > + > + pmaddwd m7, m0, [rsp+2*mmsize] > + pmaddwd m8, m1, [rsp+2*mmsize] > + pmaddwd m9, m2, [rsp+3*mmsize] > + pmaddwd m10, m6, [rsp+3*mmsize] > + pmaddwd m0, [rsp+4*mmsize] > + pmaddwd m1, [rsp+4*mmsize] > + pmaddwd m2, [rsp+5*mmsize] > + pmaddwd m6, [rsp+5*mmsize] > + paddd m7, m9 > + paddd m8, m10 > + paddd m0, m2 > + paddd m1, m6 > + psrad m7, %%sh > + psrad m8, %%sh > + psrad m0, %%sh > + psrad m1, %%sh > + packssdw m7, m8 > + packssdw m0, m1 > +%if %2 == 1 > +%if %1 == 8 > + packuswb m7, m0 > + movh [uq+xq], m7 > + movhps [vq+xq], m7 > +%else > + pminsw m7, [pw_ %+ %%maxval] > + pminsw m0, [pw_ %+ %%maxval] > + pmaxsw m7, m15 > + pmaxsw m0, m15 > + movu [uq+xq*2], m7 > + movu [vq+xq*2], m0 > +%endif > +%else ; %2 != 1 > + ; second set of u/v pixels > + SBUTTERFLY wd, 3, 4, 6 > + punpckhwd m6, m5, [pw_16384] > + punpcklwd m5, [pw_16384] > + > + pmaddwd m8, m3, [rsp+2*mmsize] > + pmaddwd m9, m4, [rsp+2*mmsize] > + pmaddwd m10, m5, [rsp+3*mmsize] > + pmaddwd m11, m6, [rsp+3*mmsize] > + pmaddwd m3, [rsp+4*mmsize] > + pmaddwd m4, [rsp+4*mmsize] > + pmaddwd m5, [rsp+5*mmsize] > + pmaddwd m6, [rsp+5*mmsize] > + paddd m8, m10 > + paddd m9, m11 > + paddd m3, m5 > + paddd m4, m6 > + psrad m8, %%sh > + psrad m9, %%sh > + psrad m3, %%sh > + psrad m4, %%sh > + packssdw m8, m9 > + packssdw m3, m4 > + > +%if %1 == 8 > + packuswb m7, m8 > + packuswb m0, m3 > + movu [uq+xq], m7 > + movu [vq+xq], m0 > +%else > + pminsw m7, [pw_ %+ %%maxval] > + pminsw m0, [pw_ %+ %%maxval] > + pminsw m8, [pw_ %+ %%maxval] > + pminsw m3, [pw_ %+ %%maxval] > + pmaxsw m7, m15 > + pmaxsw m0, m15 > + pmaxsw m8, m15 > + pmaxsw m3, m15 > + movu [uq+xq*2], m7 > + movu [uq+xq*2+mmsize], m8 > + movu [vq+xq*2], m0 > + movu [vq+xq*2+mmsize], m3 > +%endif > +%endif ; %2 ==/!= 1 > + > + add xq, mmsize >> %2 > + cmp xd, wwd > + jl .loop_h > + > +%if %3 == 0 > + add yq, ysq > +%else ; %3 != 0 > + lea yq, [yq+ysq*2] > +%endif ; %3 ==/!= 0 > + add uq, usq > + add vq, vsq > + lea rq, [rq+rgbsq*(2<<%3)] > + lea gq, [gq+rgbsq*(2<<%3)] > + lea bq, [bq+rgbsq*(2<<%3)] > + dec hd > + jg .loop_v > + > + RET > +%endmacro > + > +%macro RGB2YUV_FNS 2 > +RGB2YUV_FN 8, %1, %2 > +RGB2YUV_FN 10, %1, %2 > +RGB2YUV_FN 12, %1, %2 > +%endmacro > + > +INIT_XMM sse2 > +RGB2YUV_FNS 0, 0 > +RGB2YUV_FNS 1, 0 > +RGB2YUV_FNS 1, 1 > + > +; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride, > +; int w, int h, const int16_t coeff[3][3][8]) > +INIT_XMM sse2 > +cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c > + movh m0, [cq+ 0] > + movh m1, [cq+ 32] > + movh m2, [cq+ 48] > + movh m3, [cq+ 80] > + movh m4, [cq+ 96] > + movh m5, [cq+128] > + punpcklwd m0, [cq+ 16] > + punpcklwd m1, [pw_8192] > + punpcklwd m2, [cq+ 64] > + punpcklwd m3, [pw_8192] > + punpcklwd m4, [cq+112] > + punpcklwd m5, [pw_8192] > + > + DEFINE_ARGS data0, stride, ww, h, data1, data2, x > + shl strideq, 1 > + mov data1q, [data0q+gprsize*1] > + mov data2q, [data0q+gprsize*2] > + mov data0q, [data0q+gprsize*0] > + > +.loop_v: > + xor xd, xd > + > +.loop_h: > + mova m6, [data0q+xq*2] > + mova m7, [data1q+xq*2] > + mova m8, [data2q+xq*2] > + SBUTTERFLY wd, 6, 7, 9 > + punpckhwd m9, m8, [pw_1] > + punpcklwd m8, [pw_1] > + > + pmaddwd m10, m6, m0 > + pmaddwd m11, m7, m0 > + pmaddwd m12, m8, m1 > + pmaddwd m13, m9, m1 > + paddd m10, m12 > + paddd m11, m13 > + psrad m10, 14 > + psrad m11, 14 > + > + pmaddwd m12, m6, m2 > + pmaddwd m13, m7, m2 > + pmaddwd m14, m8, m3 > + pmaddwd m15, m9, m3 > + paddd m12, m14 > + paddd m13, m15 > + psrad m12, 14 > + psrad m13, 14 > + > + pmaddwd m6, m4 > + pmaddwd m7, m4 > + pmaddwd m8, m5 > + pmaddwd m9, m5 > + paddd m6, m8 > + paddd m7, m9 > + psrad m6, 14 > + psrad m7, 14 > + > + packssdw m10, m11 > + packssdw m12, m13 > + packssdw m6, m7 > + > + mova [data0q+xq*2], m10 > + mova [data1q+xq*2], m12 > + mova [data2q+xq*2], m6 > + > + add xd, mmsize / 2 > + cmp xd, wwd > + jl .loop_h > + > + add data0q, strideq > + add data1q, strideq > + add data2q, strideq > + dec hd > + jg .loop_v > + > + RET > +%endif > diff --git a/libavfilter/x86/colorspacedsp_init.c > b/libavfilter/x86/colorspacedsp_init.c > new file mode 100644 > index 0000000..78d34bc > --- /dev/null > +++ b/libavfilter/x86/colorspacedsp_init.c > @@ -0,0 +1,119 @@ > +/* > + * Copyright (c) 2016 Ronald S. Bultje <rsbul...@gmail.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "libavutil/x86/cpu.h" > + > +#include "libavfilter/colorspacedsp.h" > + > +#define decl_yuv2yuv_fn(t) \ > +void ff_yuv2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t > yuv_out_stride[3], \ > + uint8_t *yuv_in[3], ptrdiff_t > yuv_in_stride[3], \ > + int w, int h, const int16_t > yuv2yuv_coeffs[3][3][8], \ > + const int16_t yuv_offset[2][8]) > + > +#define decl_yuv2yuv_fns(ss) \ > +decl_yuv2yuv_fn(ss##p8to8); \ > +decl_yuv2yuv_fn(ss##p10to8); \ > +decl_yuv2yuv_fn(ss##p12to8); \ > +decl_yuv2yuv_fn(ss##p8to10); \ > +decl_yuv2yuv_fn(ss##p10to10); \ > +decl_yuv2yuv_fn(ss##p12to10); \ > +decl_yuv2yuv_fn(ss##p8to12); \ > +decl_yuv2yuv_fn(ss##p10to12); \ > +decl_yuv2yuv_fn(ss##p12to12) > + > +decl_yuv2yuv_fns(420); > +decl_yuv2yuv_fns(422); > +decl_yuv2yuv_fns(444); > + > +#define decl_yuv2rgb_fn(t) \ > +void ff_yuv2rgb_##t##_sse2(int16_t *rgb_out[3], ptrdiff_t rgb_stride, \ > + uint8_t *yuv_in[3], ptrdiff_t yuv_stride[3], \ > + int w, int h, const int16_t coeff[3][3][8], \ > + const int16_t yuv_offset[8]) > + > +#define decl_yuv2rgb_fns(ss) \ > +decl_yuv2rgb_fn(ss##p8); \ > +decl_yuv2rgb_fn(ss##p10); \ > +decl_yuv2rgb_fn(ss##p12) > + > +decl_yuv2rgb_fns(420); > +decl_yuv2rgb_fns(422); > +decl_yuv2rgb_fns(444); > + > +#define decl_rgb2yuv_fn(t) \ > +void ff_rgb2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_stride[3], \ > + int16_t *rgb_in[3], ptrdiff_t rgb_stride, \ > + int w, int h, const int16_t coeff[3][3][8], \ > + const int16_t yuv_offset[8]) > + > +#define decl_rgb2yuv_fns(ss) \ > +decl_rgb2yuv_fn(ss##p8); \ > +decl_rgb2yuv_fn(ss##p10); \ > +decl_rgb2yuv_fn(ss##p12) > + > +decl_rgb2yuv_fns(420); > +decl_rgb2yuv_fns(422); > +decl_rgb2yuv_fns(444); > + > +void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride, int w, int h, > + const int16_t coeff[3][3][8]); > + > +void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) { > +#define assign_yuv2yuv_fns(idx, ss) \ > + dsp->yuv2yuv[0][0][idx] = ff_yuv2yuv_##ss##p8to8_sse2; \ > + dsp->yuv2yuv[0][1][idx] = ff_yuv2yuv_##ss##p8to10_sse2; \ > + dsp->yuv2yuv[0][2][idx] = ff_yuv2yuv_##ss##p8to12_sse2; \ > + dsp->yuv2yuv[1][0][idx] = ff_yuv2yuv_##ss##p10to8_sse2; \ > + dsp->yuv2yuv[1][1][idx] = ff_yuv2yuv_##ss##p10to10_sse2; \ > + dsp->yuv2yuv[1][2][idx] = ff_yuv2yuv_##ss##p10to12_sse2; \ > + dsp->yuv2yuv[2][0][idx] = ff_yuv2yuv_##ss##p12to8_sse2; \ > + dsp->yuv2yuv[2][1][idx] = ff_yuv2yuv_##ss##p12to10_sse2; \ > + dsp->yuv2yuv[2][2][idx] = ff_yuv2yuv_##ss##p12to12_sse2 > + > + assign_yuv2yuv_fns(2, 420); > + assign_yuv2yuv_fns(1, 422); > + assign_yuv2yuv_fns(0, 444); > + > +#define assign_yuv2rgb_fns(idx, ss) \ > + dsp->yuv2rgb[0][idx] = ff_yuv2rgb_##ss##p8_sse2; \ > + dsp->yuv2rgb[1][idx] = ff_yuv2rgb_##ss##p10_sse2; \ > + dsp->yuv2rgb[2][idx] = ff_yuv2rgb_##ss##p12_sse2 > + > + assign_yuv2rgb_fns(2, 420); > + assign_yuv2rgb_fns(1, 422); > + assign_yuv2rgb_fns(0, 444); > + > +#define assign_rgb2yuv_fns(idx, ss) \ > + dsp->rgb2yuv[0][idx] = ff_rgb2yuv_##ss##p8_sse2; \ > + dsp->rgb2yuv[1][idx] = ff_rgb2yuv_##ss##p10_sse2; \ > + dsp->rgb2yuv[2][idx] = ff_rgb2yuv_##ss##p12_sse2 > + > + assign_rgb2yuv_fns(2, 420); > + assign_rgb2yuv_fns(1, 422); > + assign_rgb2yuv_fns(0, 444); > + > + dsp->multiply3x3 = ff_multiply3x3_sse2; > + } > +} > diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile > index c24e797..81a8b86 100644 > --- a/tests/checkasm/Makefile > +++ b/tests/checkasm/Makefile > @@ -16,6 +16,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) > > # libavfilter tests > AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o > +AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o > > CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes) > > diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c > index fb8defd..e4ca116 100644 > --- a/tests/checkasm/checkasm.c > +++ b/tests/checkasm/checkasm.c > @@ -106,6 +106,9 @@ static const struct { > #if CONFIG_BLEND_FILTER > { "vf_blend", checkasm_check_blend }, > #endif > + #if CONFIG_COLORSPACE_FILTER > + { "vf_colorspace", checkasm_check_colorspace }, > + #endif > #endif > { NULL } > }; > diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h > index 159a0a8..5a76f74 100644 > --- a/tests/checkasm/checkasm.h > +++ b/tests/checkasm/checkasm.h > @@ -33,6 +33,7 @@ > void checkasm_check_alacdsp(void); > void checkasm_check_blend(void); > void checkasm_check_bswapdsp(void); > +void checkasm_check_colorspace(void); > void checkasm_check_flacdsp(void); > void checkasm_check_fmtconvert(void); > void checkasm_check_h264pred(void); > diff --git a/tests/checkasm/vf_colorspace.c > b/tests/checkasm/vf_colorspace.c > new file mode 100644 > index 0000000..fcbb62a > --- /dev/null > +++ b/tests/checkasm/vf_colorspace.c > @@ -0,0 +1,314 @@ > +/* > + * Copyright (c) 2016 Ronald S. Bultje <rsbul...@gmail.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include <string.h> > +#include "checkasm.h" > +#include "libavfilter/colorspacedsp.h" > +#include "libavutil/common.h" > +#include "libavutil/internal.h" > +#include "libavutil/intreadwrite.h" > + > +#define W 64 > +#define H 64 > + > +#define randomize_buffers() \ > + do { \ > + unsigned mask = bpp_mask[idepth]; \ > + int n, m; \ > + int bpp = 1 + (!!idepth); \ > + int buf_size = W * H * bpp; \ > + for (m = 0; m < 3; m++) { \ > + int ss = m ? ss_w + ss_h : 0; \ > + int plane_sz = buf_size >> ss; \ > + for (n = 0; n < plane_sz; n += 4) { \ > + unsigned r = rnd() & mask; \ > + AV_WN32A(&src[m][n], r); \ > + } \ > + } \ > + } while (0) > + > +static const char *format_string[] = { > + "444", "422", "420" > +}; > + > +static unsigned bpp_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff }; > + > +static void check_yuv2yuv(void) > +{ > + declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3], > + uint8_t *src[3], ptrdiff_t src_stride[3], > + int w, int h, const int16_t coeff[3][3][8], > + const int16_t off[2][8]); > + ColorSpaceDSPContext dsp; > + int idepth, odepth, fmt, n; > + LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]); > + LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]); > + LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]); > + uint8_t *src[3] = { src_y, src_u, src_v }; > + LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H * 2]); > + LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H * 2]); > + LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H * 2]); > + LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H * 2]); > + LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H * 2]); > + LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H * 2]); > + uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, > dst1_u, dst1_v }; > + LOCAL_ALIGNED_32(int16_t, offset_buf, [16]); > + LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]); > + int16_t (*offset)[8] = (int16_t(*)[8]) offset_buf; > + int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf; > + > + ff_colorspacedsp_init(&dsp); > + for (n = 0; n < 8; n++) { > + offset[0][n] = offset[1][n] = 16; > + > + coeff[0][0][n] = (1 << 14) + (1 << 7) + 1; > + coeff[0][1][n] = (1 << 7) - 1; > + coeff[0][2][n] = -(1 << 8); > + coeff[1][0][n] = coeff[2][0][n] = 0; > + coeff[1][1][n] = (1 << 14) + (1 << 7); > + coeff[1][2][n] = -(1 << 7); > + coeff[2][2][n] = (1 << 14) - (1 << 6); > + coeff[2][1][n] = 1 << 6; > + } > + for (idepth = 0; idepth < 3; idepth++) { > + for (odepth = 0; odepth < 3; odepth++) { > + for (fmt = 0; fmt < 3; fmt++) { > + if (check_func(dsp.yuv2yuv[idepth][odepth][fmt], > + "ff_colorspacedsp_yuv2yuv_%sp%dto%d", > + format_string[fmt], > + idepth * 2 + 8, odepth * 2 + 8)) { > + int ss_w = !!fmt, ss_h = fmt == 2; > + int y_src_stride = W << !!idepth, y_dst_stride = W << > !!odepth; > + int uv_src_stride = y_src_stride >> ss_w, > uv_dst_stride = y_dst_stride >> ss_w; > + > + randomize_buffers(); > + call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, > uv_dst_stride, uv_dst_stride }, > + src, (ptrdiff_t[3]) { y_src_stride, > uv_src_stride, uv_src_stride }, > + W, H, coeff, offset); > + call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, > uv_dst_stride, uv_dst_stride }, > + src, (ptrdiff_t[3]) { y_src_stride, > uv_src_stride, uv_src_stride }, > + W, H, coeff, offset); > + if (memcmp(dst0[0], dst1[0], y_dst_stride * H) || > + memcmp(dst0[1], dst1[1], uv_dst_stride * H >> > ss_h) || > + memcmp(dst0[2], dst1[2], uv_dst_stride * H >> > ss_h)) { > + fail(); > + } > + } > + } > + } > + } > + > + report("yuv2yuv"); > +} > + > +static void check_yuv2rgb(void) > +{ > + declare_func(void, int16_t *dst[3], ptrdiff_t dst_stride, > + uint8_t *src[3], ptrdiff_t src_stride[3], > + int w, int h, const int16_t coeff[3][3][8], > + const int16_t off[8]); > + ColorSpaceDSPContext dsp; > + int idepth, fmt, n; > + LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]); > + LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]); > + LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]); > + uint8_t *src[3] = { src_y, src_u, src_v }; > + LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]); > + int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, > dst1_u, dst1_v }; > + LOCAL_ALIGNED_32(int16_t, offset, [8]); > + LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]); > + int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf; > + > + ff_colorspacedsp_init(&dsp); > + for (n = 0; n < 8; n++) { > + offset[n] = 16; > + > + coeff[0][0][n] = coeff[1][0][n] = coeff[2][0][n] = (1 << 14) | 1; > + coeff[0][1][n] = coeff[2][2][n] = 0; > + coeff[0][2][n] = 1 << 13; > + coeff[1][1][n] = -(1 << 12); > + coeff[1][2][n] = 1 << 12; > + coeff[2][1][n] = 1 << 11; > + } > + for (idepth = 0; idepth < 3; idepth++) { > + for (fmt = 0; fmt < 3; fmt++) { > + if (check_func(dsp.yuv2rgb[idepth][fmt], > + "ff_colorspacedsp_yuv2rgb_%sp%d", > + format_string[fmt], idepth * 2 + 8)) { > + int ss_w = !!fmt, ss_h = fmt == 2; > + int y_src_stride = W << !!idepth; > + int uv_src_stride = y_src_stride >> ss_w; > + > + randomize_buffers(); > + call_ref(dst0, W, src, > + (ptrdiff_t[3]) { y_src_stride, uv_src_stride, > uv_src_stride }, > + W, H, coeff, offset); > + call_new(dst1, W, src, > + (ptrdiff_t[3]) { y_src_stride, uv_src_stride, > uv_src_stride }, > + W, H, coeff, offset); > + if (memcmp(dst0[0], dst1[0], W * H * sizeof(int16_t)) || > + memcmp(dst0[1], dst1[1], W * H * sizeof(int16_t)) || > + memcmp(dst0[2], dst1[2], W * H * sizeof(int16_t))) { > + fail(); > + } > + } > + } > + } > + > + report("yuv2rgb"); > +} > + > +#undef randomize_buffers > +#define randomize_buffers() \ > + do { \ > + int y, x, p; \ > + for (p = 0; p < 3; p++) { \ > + for (y = 0; y < H; y++) { \ > + for (x = 0; x < W; x++) { \ > + int r = rnd() & 0x7fff; \ > + r -= (32768 - 28672) >> 1; \ > + src[p][y * W + x] = r; \ > + } \ > + } \ > + } \ > + } while (0) > + > +static void check_rgb2yuv(void) > +{ > + declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3], > + int16_t *src[3], ptrdiff_t src_stride, > + int w, int h, const int16_t coeff[3][3][8], > + const int16_t off[8]); > + ColorSpaceDSPContext dsp; > + int odepth, fmt, n; > + LOCAL_ALIGNED_32(int16_t, src_y, [W * H * 2]); > + LOCAL_ALIGNED_32(int16_t, src_u, [W * H * 2]); > + LOCAL_ALIGNED_32(int16_t, src_v, [W * H * 2]); > + int16_t *src[3] = { src_y, src_u, src_v }; > + LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H]); > + LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H]); > + LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H]); > + LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H]); > + LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H]); > + LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H]); > + uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, > dst1_u, dst1_v }; > + LOCAL_ALIGNED_32(int16_t, offset, [8]); > + LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]); > + int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf; > + > + ff_colorspacedsp_init(&dsp); > + for (n = 0; n < 8; n++) { > + offset[n] = 16; > + > + // these somewhat resemble bt601/smpte170m coefficients > + coeff[0][0][n] = lrint(0.3 * (1 << 14)); > + coeff[0][1][n] = lrint(0.6 * (1 << 14)); > + coeff[0][2][n] = lrint(0.1 * (1 << 14)); > + coeff[1][0][n] = lrint(-0.15 * (1 << 14)); > + coeff[1][1][n] = lrint(-0.35 * (1 << 14)); > + coeff[1][2][n] = lrint(0.5 * (1 << 14)); > + coeff[2][0][n] = lrint(0.5 * (1 << 14)); > + coeff[2][1][n] = lrint(-0.42 * (1 << 14)); > + coeff[2][2][n] = lrint(-0.08 * (1 << 14)); > + } > + for (odepth = 0; odepth < 3; odepth++) { > + for (fmt = 0; fmt < 3; fmt++) { > + if (check_func(dsp.rgb2yuv[odepth][fmt], > + "ff_colorspacedsp_rgb2yuv_%sp%d", > + format_string[fmt], odepth * 2 + 8)) { > + int ss_w = !!fmt, ss_h = fmt == 2; > + int y_dst_stride = W << !!odepth; > + int uv_dst_stride = y_dst_stride >> ss_w; > + > + randomize_buffers(); > + call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, > uv_dst_stride, uv_dst_stride }, > + src, W, W, H, coeff, offset); > + call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, > uv_dst_stride, uv_dst_stride }, > + src, W, W, H, coeff, offset); > + if (memcmp(dst0[0], dst1[0], H * y_dst_stride) || > + memcmp(dst0[1], dst1[1], H * uv_dst_stride >> ss_h) || > + memcmp(dst0[2], dst1[2], H * uv_dst_stride >> ss_h)) { > + fail(); > + } > + } > + } > + } > + > + report("rgb2yuv"); > +} > + > +static void check_multiply3x3(void) > +{ > + declare_func(void, int16_t *data[3], ptrdiff_t stride, > + int w, int h, const int16_t coeff[3][3][8]); > + ColorSpaceDSPContext dsp; > + LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]); > + LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]); > + int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, > dst1_u, dst1_v }; > + int16_t **src = dst0; > + LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]); > + int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf; > + int n; > + > + ff_colorspacedsp_init(&dsp); > + for (n = 0; n < 8; n++) { > + coeff[0][0][n] = lrint(0.85 * (1 << 14)); > + coeff[0][1][n] = lrint(0.10 * (1 << 14)); > + coeff[0][2][n] = lrint(0.05 * (1 << 14)); > + coeff[1][0][n] = lrint(-0.1 * (1 << 14)); > + coeff[1][1][n] = lrint(0.95 * (1 << 14)); > + coeff[1][2][n] = lrint(0.15 * (1 << 14)); > + coeff[2][0][n] = lrint(-0.2 * (1 << 14)); > + coeff[2][1][n] = lrint(0.30 * (1 << 14)); > + coeff[2][2][n] = lrint(0.90 * (1 << 14)); > + } > + if (check_func(dsp.multiply3x3, "ff_colorspacedsp_multiply3x3")) { > + randomize_buffers(); > + memcpy(dst1_y, dst0_y, W * H * sizeof(*dst1_y)); > + memcpy(dst1_u, dst0_u, W * H * sizeof(*dst1_u)); > + memcpy(dst1_v, dst0_v, W * H * sizeof(*dst1_v)); > + call_ref(dst0, W, W, H, coeff); > + call_new(dst1, W, W, H, coeff); > + if (memcmp(dst0[0], dst1[0], H * W * sizeof(*dst0_y)) || > + memcmp(dst0[1], dst1[1], H * W * sizeof(*dst0_u)) || > + memcmp(dst0[2], dst1[2], H * W * sizeof(*dst0_v))) { > + fail(); > + } > + } > + > + report("multiply3x3"); > +} > + > +void checkasm_check_colorspace(void) > +{ > + check_yuv2yuv(); > + check_yuv2rgb(); > + check_rgb2yuv(); > + check_multiply3x3(); > +} > -- > 2.1.2 > Otherwise seems ok. Kieran _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel