I also added a Makefile that assembles this file into libpostproc. I haven't modified the C code to use these functions yet. --- libpostproc/x86/Makefile | 1 + libpostproc/x86/deinterlace.asm | 167 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 libpostproc/x86/Makefile create mode 100644 libpostproc/x86/deinterlace.asm
diff --git a/libpostproc/x86/Makefile b/libpostproc/x86/Makefile new file mode 100644 index 0000000..06838ca --- /dev/null +++ b/libpostproc/x86/Makefile @@ -0,0 +1 @@ +YASM-OBJS-$(CONFIG_POSTPROC) += x86/deinterlace.o diff --git a/libpostproc/x86/deinterlace.asm b/libpostproc/x86/deinterlace.asm new file mode 100644 index 0000000..6e669bb --- /dev/null +++ b/libpostproc/x86/deinterlace.asm @@ -0,0 +1,167 @@ +;* +;* DeInterlacing filters written using SIMD extensions +;* Copyright (C) 2015 Tucker DiNapoli (T.Dinapoli at gmail.com) +;* +;* Adapted from inline assembly: +;* Copyright (C) 2001-2002 Michael Niedermayer (michae...@gmx.at) +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;* + +%include "PPUtil.asm" +;; All deinterlace functions operate on N 8x8 blocks at a time, where N +;; is the size (in bytes) of the simd registers being used divided +;; by 8, so 2 for xmm, and 4 for ymm. 
+
+;; NOTE: The function names are camel case for compatibility with existing
+;; postprocessing code, eventually the names of postprocessing functions
+;; will be changed to use lowercase and underscores
+
+;; Register convention used by every function below (x86inc numbers the
+;; arguments from r0): r0 = src, r1 = stride, r2 = third argument if any.
+;; Each load/store moves mmsize bytes of one row.
+;; NOTE(review): rows are accessed with mova, so every row address is assumed
+;; to be mmsize-aligned, and stride is assumed to arrive as a pointer-sized
+;; value -- confirm both against the C prototypes.
+
+;; Deinterlace blocks using linear interpolation
+;; Set each line 2n+1 to (line 2n + line 2n+2)/2
+;; Rows are numbered from src + 4*stride; rows 1, 3, 5 and 7 are rewritten,
+;; rows 0, 2, 4, 6 and 8 are only read.
+%macro gen_deinterlace_interpolate_linear 0
+cglobal deInterlaceInterpolateLinear, 2, 4, 2 ; src, stride
+    lea       r0, [r0 + r1 * 4]       ; r0 = row 0
+    lea       r2, [r0 + r1]           ; r2 = row 1
+    lea       r3, [r2 + r1 * 4]       ; r3 = row 5
+    mova      m0, [r0]                ; row 0
+    mova      m1, [r2 + r1]           ; row 2
+    pavgb     m0, m1
+    mova      [r2], m0                ; row 1 = avg(row 0, row 2)
+    mova      m0, [r0 + r1 * 4]       ; row 4
+    pavgb     m1, m0
+    mova      [r2 + r1 * 2], m1       ; row 3 = avg(row 2, row 4)
+    mova      m1, [r3 + r1]           ; row 6
+    pavgb     m0, m1
+    mova      [r3], m0                ; row 5 = avg(row 4, row 6)
+    ;; load row 8 into m0 (not m1) so that row 6 is still available
+    mova      m0, [r0 + r1 * 8]       ; row 8
+    pavgb     m1, m0
+    mova      [r3 + r1 * 2], m1       ; row 7 = avg(row 6, row 8)
+    RET
+%endmacro
+
+;; Cubic interpolation of one row from four of its vertical neighbours.
+;; Takes five memory operands a, b, c, d, e where, relative to the output
+;; row c: a = c-3, b = c-1, d = c+1, e = c+3.
+;; Stores c = (9b + 9d - a - e)/16 with unsigned saturation. The sums are
+;; built from pavgb (which rounds up), so the result is approximate.
+;; Requires m4 == 0; clobbers m0-m3.
+;; Defined once at file scope; the m# names are expanded where the macro is
+;; used, so the same definition serves every INIT_* instantiation.
+%macro deint_cubic 5
+    mova      m0, %1
+    mova      m1, %2
+    mova      m2, %4
+    mova      m3, %5
+    pavgb     m1, m2                  ; (b+d)/2
+    pavgb     m0, m3                  ; (a+e)/2
+    ;; convert each byte into a word:
+    ;; m1/m2 = low/high half of (b+d)/2, m0/m3 = low/high half of (a+e)/2
+    mova      m2, m1
+    punpcklbw m1, m4
+    punpckhbw m2, m4
+    mova      m3, m0
+    punpcklbw m0, m4
+    punpckhbw m3, m4
+    ;; calculate the pixel values
+    psubw     m0, m1                  ; L(a+e - (b+d))/2
+    psubw     m3, m2                  ; H(a+e - (b+d))/2
+    psraw     m0, 3                   ; L(a+e - (b+d))/16
+    psraw     m3, 3                   ; H(a+e - (b+d))/16
+    psubw     m1, m0                  ; L(9(b+d) - (a+e))/16
+    psubw     m2, m3                  ; H(9(b+d) - (a+e))/16
+    ;; convert the words back into bytes using unsigned saturation
+    packuswb  m1, m2
+    mova      %3, m1
+%endmacro
+
+;; Deinterlace blocks using cubic interpolation
+;; Line 2n+1 = (9(2n) + 9(2n+2) - (2n-2) - (2n+4))/16
+;; Rows are numbered from src + 3*stride; rows 3, 5, 7 and 9 are rewritten
+;; from the even rows 0 to 12.
+%macro gen_deinterlace_interpolate_cubic 0
+cglobal deInterlaceInterpolateCubic, 2, 5, 5 ; src, stride
+    lea       r2, [r1 + r1 * 2]
+    add       r0, r2                  ; r0 = src + 3*stride = row 0
+    lea       r2, [r0 + r1]           ; r2 = row 1
+    lea       r3, [r2 + r1 * 4]       ; r3 = row 5
+    lea       r4, [r3 + r1 * 4]       ; r4 = row 9
+    add       r4, r1                  ; r4 = row 10
+    pxor      m4, m4                  ; zero, used by deint_cubic to widen bytes
+    ;; row 3 from rows 0, 2, 4, 6
+    deint_cubic [r0], [r2 + r1], [r2 + r1 * 2], [r0 + r1 * 4], [r3 + r1]
+    ;; row 5 from rows 2, 4, 6, 8
+    deint_cubic [r2 + r1], [r0 + r1 * 4], [r3], [r3 + r1], [r0 + r1 * 8]
+    ;; row 7 from rows 4, 6, 8, 10
+    deint_cubic [r0 + r1 * 4], [r3 + r1], [r3 + r1 * 2], [r0 + r1 * 8], [r4]
+    ;; row 9 from rows 6, 8, 10, 12
+    deint_cubic [r3 + r1], [r0 + r1 * 8], [r3 + r1 * 4], [r4], [r4 + r1 * 2]
+    RET
+%endmacro
+
+;; Deinterlace blocks by setting every line n to (n-1 + 2n + n+1)/4
+;; Each output is pavgb(pavgb(above, below), current), so rounding is
+;; approximate. Rows are numbered from src + 4*stride; rows 0-7 are
+;; rewritten and row 8 is only read.
+;; [tmp] is read as the row above row 0 and, on exit, receives the
+;; unfiltered row 7 (presumably so the next block down can use it as its
+;; "row above" -- confirm against the caller).
+%macro gen_deinterlace_blend_linear 0
+cglobal deInterlaceBlendLinear, 3, 5, 3 ; src, stride, tmp
+    lea       r0, [r0 + r1 * 4]       ; r0 = row 0
+    lea       r3, [r0 + r1]           ; r3 = row 1
+    lea       r4, [r3 + r1 * 4]       ; r4 = row 5
+    mova      m0, [r2]                ; row above (from tmp)
+    mova      m1, [r3]                ; row 1
+    mova      m2, [r0]                ; row 0
+    pavgb     m0, m1                  ; (above + row 1)/2
+    pavgb     m0, m2                  ; (above + 2*row 0 + row 1)/4
+    mova      [r0], m0                ; -> row 0
+    mova      m0, [r3 + r1]           ; row 2
+    pavgb     m2, m0
+    pavgb     m2, m1
+    mova      [r3], m2                ; -> row 1
+    mova      m2, [r3 + r1 * 2]       ; row 3
+    pavgb     m1, m2
+    pavgb     m1, m0
+    mova      [r3 + r1], m1           ; -> row 2
+    mova      m1, [r0 + r1 * 4]       ; row 4
+    pavgb     m0, m1
+    pavgb     m0, m2
+    mova      [r3 + r1 * 2], m0       ; -> row 3
+    mova      m0, [r4]                ; row 5
+    pavgb     m2, m0
+    pavgb     m2, m1
+    mova      [r0 + r1 * 4], m2       ; -> row 4
+    mova      m2, [r4 + r1]           ; row 6
+    pavgb     m1, m2
+    pavgb     m1, m0
+    mova      [r4], m1                ; -> row 5
+    mova      m1, [r4 + r1 * 2]       ; row 7
+    pavgb     m0, m1
+    pavgb     m0, m2
+    mova      [r4 + r1], m0           ; -> row 6
+    mova      m0, [r0 + r1 * 8]       ; row 8
+    pavgb     m2, m0
+    pavgb     m2, m1
+    mova      [r4 + r1 * 2], m2       ; -> row 7
+    mova      [r2], m1                ; tmp = unfiltered row 7
+    RET
+%endmacro
+
+;; The INIT_* macros only select the register width and the name suffix of
+;; the functions generated after them; they do not prevent a version from
+;; being assembled. The AVX2 versions are therefore guarded on assembler
+;; support (HAVE_AVX2_EXTERNAL, from FFmpeg's generated config.asm).
+;; Picking a version the running CPU supports is left to the C side.
+SECTION_TEXT
+
+INIT_MMX mmx2
+gen_deinterlace_interpolate_linear
+gen_deinterlace_interpolate_cubic
+gen_deinterlace_blend_linear
+
+INIT_XMM sse2
+gen_deinterlace_interpolate_linear
+gen_deinterlace_interpolate_cubic
+gen_deinterlace_blend_linear
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+gen_deinterlace_interpolate_linear
+gen_deinterlace_interpolate_cubic
+gen_deinterlace_blend_linear
+%endif
-- 
2.2.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel