Re: [FFmpeg-devel] [RFC PATCH] avfilter/fastdeint: import simple cpu-optimized deinterlacing algorithms from VLC

Paul B Mahol Mon, 09 Sep 2019 13:46:53 -0700

On 9/9/19, Aman Gupta <[email protected]> wrote:
> From: Aman Gupta <[email protected]>
>
> These are simple algorithms which can be run efficiently
> on low powered devices to produce deinterlaced images.
>
> Signed-off-by: Aman Gupta <[email protected]>
> ---
>  doc/filters.texi                 |  27 ++
>  libavfilter/Makefile             |   1 +
>  libavfilter/aarch64/Makefile     |   1 +
>  libavfilter/aarch64/merge_neon.S |  98 ++++++
>  libavfilter/allfilters.c         |   1 +
>  libavfilter/arm/Makefile         |   3 +
>  libavfilter/arm/merge_armv6.S    |  70 ++++
>  libavfilter/arm/merge_neon.S     | 109 ++++++
>  libavfilter/vf_fastdeint.c       | 588 +++++++++++++++++++++++++++++++
>  9 files changed, 898 insertions(+)
>  create mode 100644 libavfilter/aarch64/merge_neon.S
>  create mode 100644 libavfilter/arm/Makefile
>  create mode 100644 libavfilter/arm/merge_armv6.S
>  create mode 100644 libavfilter/arm/merge_neon.S
>  create mode 100644 libavfilter/vf_fastdeint.c
>
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 6c81e1da40..55d9adeb81 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -9796,6 +9796,33 @@ fade=t=in:st=5.5:d=0.5
>
>  @end itemize
>
> +@section fastdeint
> +Fast deinterlacing algorithms.
> +
> +@table @option
> +@item mode
> +Deinterlacing algorithm to use.
> +
> +It accepts the following values:
> +@table @samp
> +@item discard
> +Discard bottom frame.
> +
> +@item mean
> +Half resolution blender.
> +
> +@item blend
> +Full resolution blender.
> +
> +@item bob
> +Bob doubler.
> +
> +@item linear
> +Bob doubler with linear interpolation.
> +@end table
> +
> +@end table
> +
>  @section fftdnoiz
>  Denoise frames using 3D FFT (frequency domain filtering).
>
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 3ef4191d9a..a2b3566ec0 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -234,6 +234,7 @@ OBJS-$(CONFIG_EROSION_OPENCL_FILTER)         +=
> vf_neighbor_opencl.o opencl.o \
>                                                  opencl/neighbor.o
>  OBJS-$(CONFIG_EXTRACTPLANES_FILTER)          += vf_extractplanes.o
>  OBJS-$(CONFIG_FADE_FILTER)                   += vf_fade.o
> +OBJS-$(CONFIG_FASTDEINT_FILTER)              += vf_fastdeint.o
>  OBJS-$(CONFIG_FFTDNOIZ_FILTER)               += vf_fftdnoiz.o
>  OBJS-$(CONFIG_FFTFILT_FILTER)                += vf_fftfilt.o
>  OBJS-$(CONFIG_FIELD_FILTER)                  += vf_field.o
> diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
> index b58daa3a3f..2b0ad92893 100644
> --- a/libavfilter/aarch64/Makefile
> +++ b/libavfilter/aarch64/Makefile
> @@ -1,3 +1,4 @@
>  OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
>
> +NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)         += aarch64/merge_neon.o
>  NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
> diff --git a/libavfilter/aarch64/merge_neon.S
> b/libavfilter/aarch64/merge_neon.S
> new file mode 100644
> index 0000000000..62377331a4
> --- /dev/null
> +++ b/libavfilter/aarch64/merge_neon.S
> @@ -0,0 +1,98 @@
> +/*
> + * Copyright (c) 2009-2016 Rémi Denis-Courmont, Janne Grunau, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +#define dest x0
> +#define src1 x1
> +#define src2 x2
> +#define size x3
> +
> +        .align 2
> +        // NOTE: Offset and pitch must be multiple of 16-bytes.
> +function ff_merge8_neon, export=1
> +        ands            x5, size, #~63
> +        b.eq            2f
> +        mov             x10, #64
> +        add             x11, src1, #32
> +        add             x12, src2, #32
> +1:
> +        ld1             {v0.16b,v1.16b}, [src1], x10
> +        ld1             {v4.16b,v5.16b}, [src2], x10
> +        ld1             {v2.16b,v3.16b}, [x11], x10
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        ld1             {v6.16b,v7.16b}, [x12], x10
> +        subs            x5, x5, #64
> +        uhadd           v1.16b, v1.16b, v5.16b
> +        uhadd           v2.16b, v2.16b, v6.16b
> +        uhadd           v3.16b, v3.16b, v7.16b
> +        st1             {v0.16b,v1.16b}, [dest], #32
> +        st1             {v2.16b,v3.16b}, [dest], #32
> +        b.gt            1b
> +2:
> +        tbz             size, #5,  3f
> +        ld1             {v0.16b,v1.16b}, [src1], #32
> +        ld1             {v4.16b,v5.16b}, [src2], #32
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        uhadd           v1.16b, v1.16b, v5.16b
> +        st1             {v0.16b,v1.16b}, [dest], #32
> +3:
> +        tbz             size, #4, 4f
> +        ld1             {v0.16b}, [src1]
> +        ld1             {v4.16b}, [src2]
> +        uhadd           v0.16b, v0.16b, v4.16b
> +        st1             {v0.16b}, [dest]
> +4:
> +        ret
> +endfunc
> +
> +        .align 2
> +function ff_merge16_neon, export=1
> +        ands            x5, size, #~63
> +        b.eq            2f
> +1:
> +        ld1             {v0.8h,v1.8h}, [src1], #32
> +        ld1             {v4.8h,v5.8h}, [src2], #32
> +        ld1             {v2.8h,v3.8h}, [src1], #32
> +        uhadd           v0.8h, v0.8h, v4.8h
> +        ld1             {v6.8h,v7.8h}, [src2], #32
> +        uhadd           v1.8h, v1.8h, v5.8h
> +        uhadd           v2.8h, v2.8h, v6.8h
> +        uhadd           v3.8h, v3.8h, v7.8h
> +        st1             {v0.8h,v1.8h}, [dest], #32
> +        st1             {v2.8h,v3.8h}, [dest], #32
> +        subs            x5, x5, #64
> +        b.gt            1b
> +2:
> +        tbz             size, #5, 3f
> +        ld1             {v0.8h,v1.8h}, [src1], #32
> +        ld1             {v4.8h,v5.8h}, [src2], #32
> +        uhadd           v0.8h, v0.8h, v4.8h
> +        uhadd           v1.8h, v1.8h, v5.8h
> +        st1             {v0.8h,v1.8h}, [dest], #32
> +3:
> +        tbz             size, #4,  4f
> +        ld1             {v0.8h}, [src1]
> +        ld1             {v4.8h}, [src2]
> +        uhadd           v0.8h, v0.8h,v4.8h
> +        st1             {v0.8h}, [dest]
> +4:
> +        ret
> +endfunc
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index b675c688ee..6631af2ffe 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -219,6 +219,7 @@ extern AVFilter ff_vf_erosion;
>  extern AVFilter ff_vf_erosion_opencl;
>  extern AVFilter ff_vf_extractplanes;
>  extern AVFilter ff_vf_fade;
> +extern AVFilter ff_vf_fastdeint;
>  extern AVFilter ff_vf_fftdnoiz;
>  extern AVFilter ff_vf_fftfilt;
>  extern AVFilter ff_vf_field;
> diff --git a/libavfilter/arm/Makefile b/libavfilter/arm/Makefile
> new file mode 100644
> index 0000000000..c92d62fac9
> --- /dev/null
> +++ b/libavfilter/arm/Makefile
> @@ -0,0 +1,3 @@
> +ARMV6-OBJS-$(CONFIG_FASTDEINT_FILTER)  += arm/merge_armv6.o
> +
> +NEON-OBJS-$(CONFIG_FASTDEINT_FILTER)   += arm/merge_neon.o
> diff --git a/libavfilter/arm/merge_armv6.S b/libavfilter/arm/merge_armv6.S
> new file mode 100644
> index 0000000000..9b551c2c6c
> --- /dev/null
> +++ b/libavfilter/arm/merge_armv6.S
> @@ -0,0 +1,70 @@
> +/*
> + * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +
> +#define dest r0
> +#define src1 r1
> +#define src2 r2
> +#define size r3
> +
> +        .align 2
> +function ff_merge8_armv6, export=1
> +        push            {r4-r9,lr}
> +1:
> +        pld             [src1, #64]
> +        ldm             src1!, {r4-r5}
> +        pld             [src2, #64]
> +        ldm             src2!, {r8-r9}
> +        subs            size, size, #16
> +        uhadd8          r4, r4, r8
> +        ldm             src1!, {r6-r7}
> +        uhadd8          r5, r5, r9
> +        ldm             src2!, {ip,lr}
> +        uhadd8          r6, r6, ip
> +        stm             dest!, {r4-r5}
> +        uhadd8          r7, r7, lr
> +        stm             dest!, {r6-r7}
> +        it              eq
> +        popeq           {r4-r9,pc}
> +        b               1b
> +endfunc
> +
> +        .align 2
> +function ff_merge16_armv6, export=1
> +        push            {r4-r9,lr}
> +1:
> +        pld             [src1, #64]
> +        ldm             src1!, {r4-r5}
> +        pld             [src2, #64]
> +        ldm             src2!, {r8-r9}
> +        subs            size, size, #16
> +        uhadd16         r4, r4, r8
> +        ldm             src1!, {r6-r7}
> +        uhadd16         r5, r5, r9
> +        ldm             src2!, {ip,lr}
> +        uhadd16         r6, r6, ip
> +        stm             dest!, {r4-r5}
> +        uhadd16         r7, r7, lr
> +        stm             dest!, {r6-r7}
> +        it              eq
> +        popeq           {r4-r9,pc}
> +        b               1b
> +endfunc
> \ No newline at end of file
> diff --git a/libavfilter/arm/merge_neon.S b/libavfilter/arm/merge_neon.S
> new file mode 100644
> index 0000000000..ae36cf3ca9
> --- /dev/null
> +++ b/libavfilter/arm/merge_neon.S
> @@ -0,0 +1,109 @@
> +/*
> + * Copyright (c) 2009-2012 Rémi Denis-Courmont, VLC authors
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +
> +#define dest r0
> +#define src1 r1
> +#define src2 r2
> +#define size r3
> +
> +        .align 2
> +        @ NOTE: Offset and pitch must be multiple of 16-bytes.
> +function ff_merge8_neon, export=1
> +        cmp             size, #64
> +        blo             2f
> +1:
> +        pld             [src1, #64]
> +        vld1.u8         {q0-q1}, [src1,:128]!
> +        pld             [src2, #64]
> +        vld1.u8         {q8-q9}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        sub             size, size, #64
> +        vld1.u8         {q2-q3}, [src1,:128]!
> +        vhadd.u8        q1, q1, q9
> +        vld1.u8         {q10-q11}, [src2,:128]!
> +        vhadd.u8        q2, q2, q10
> +        cmp             size, #64
> +        vhadd.u8        q3, q3, q11
> +        vst1.u8         {q0-q1}, [dest,:128]!
> +        vst1.u8         {q2-q3}, [dest,:128]!
> +        bhs             1b
> +2:
> +        cmp             size, #32
> +        blo             3f
> +        vld1.u8         {q0-q1}, [src1,:128]!
> +        sub             size, size, #32
> +        vld1.u8         {q8-q9}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        vhadd.u8        q1, q1, q9
> +        vst1.u8         {q0-q1}, [dest,:128]!
> +3:
> +        cmp             size, #16
> +        it              lo
> +        bxlo            lr
> +        vld1.u8         {q0}, [src1,:128]!
> +        sub             size, size, #16
> +        vld1.u8         {q8}, [src2,:128]!
> +        vhadd.u8        q0, q0, q8
> +        vst1.u8         {q0}, [dest,:128]!
> +        bx              lr
> +endfunc
> +
> +        .align 2
> +function ff_merge16_neon, export=1
> +        cmp             size, #64
> +        blo             2f
> +1:
> +        pld             [src1, #64]
> +        vld1.u16        {q0-q1}, [src1,:128]!
> +        pld             [src2, #64]
> +        vld1.u16        {q8-q9}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        sub             size, size, #64
> +        vld1.u16        {q2-q3}, [src1,:128]!
> +        vhadd.u16       q1, q1, q9
> +        vld1.u16        {q10-q11}, [src2,:128]!
> +        vhadd.u16       q2, q2, q10
> +        cmp             size, #64
> +        vhadd.u16       q3, q3, q11
> +        vst1.u16        {q0-q1}, [dest,:128]!
> +        vst1.u16        {q2-q3}, [dest,:128]!
> +        bhs             1b
> +2:
> +        cmp             size, #32
> +        blo             3f
> +        vld1.u16        {q0-q1}, [src1,:128]!
> +        sub             size, size, #32
> +        vld1.u16        {q8-q9}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        vhadd.u16       q1, q1, q9
> +        vst1.u16        {q0-q1}, [dest,:128]!
> +3:
> +        cmp             size, #16
> +        it              lo
> +        bxlo            lr
> +        vld1.u16        {q0}, [src1,:128]!
> +        sub             size, size, #16
> +        vld1.u16        {q8}, [src2,:128]!
> +        vhadd.u16       q0, q0, q8
> +        vst1.u16        {q0}, [dest,:128]!
> +        bx              lr
> +endfunc
> \ No newline at end of file
> diff --git a/libavfilter/vf_fastdeint.c b/libavfilter/vf_fastdeint.c
> new file mode 100644
> index 0000000000..5ddd8be392
> --- /dev/null
> +++ b/libavfilter/vf_fastdeint.c
> @@ -0,0 +1,588 @@
> +/*
> + * Copyright (C) 2015 Aman Gupta <[email protected]>
> + *               2000-2011 VLC authors and VideoLAN
> + *
> + * Author: Sam Hocevar <[email protected]>
> + *         Damien Lucas <[email protected]>
> + *         Laurent Aimar <[email protected]>
> + *         Sigmund Augdal Helberg <[email protected]>
> + *
> + * These algorithms are derived from the VLC project's
> + * modules/video_filter/deinterlace/algo_basic.c
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/common.h"
> +#include "libavutil/opt.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/timestamp.h"
> +#include "avfilter.h"
> +#include "formats.h"
> +#include "internal.h"
> +#include "video.h"
> +
> +enum Mode {
> +  MODE_DISCARD,
> +  MODE_MEAN,
> +  MODE_BLEND,
> +  MODE_BOB,
> +  MODE_LINEAR,
> +  MODE_MAX,
> +};
> +
> +typedef void (*merge_fn)(void *dst, const void *src1, const void *src2,
> size_t len);
> +
> +typedef struct FastDeintContext {
> +    const AVClass *class;
> +    merge_fn merge;
> +    int merge_size;
> +    int merge_aligned;
> +    AVFrame *cur, *next;
> +    enum Mode mode;
> +    int eof;
> +} FastDeintContext;
> +
> +static void merge8_c(uint8_t *dst, const uint8_t *src1, const uint8_t
> *src2, size_t bytes)
> +{
> +    for (; bytes > 0; bytes--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +}
> +
> +static void merge16_c(uint16_t *dst, const uint16_t *src1, const uint16_t
> *src2, size_t bytes)
> +{
> +    for (size_t words = bytes / 2; words > 0; words--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +}
> +
> +static void merge8_unaligned(FastDeintContext *s, uint8_t *dst, const
> uint8_t *src1, const uint8_t *src2, size_t bytes)
> +{
> +    if (s->merge_aligned) {
> +        size_t remainder = bytes % 16;
> +        if (remainder > 0) {
> +            merge8_c(dst, src1, src2, remainder);
> +            bytes -= remainder;
> +            dst += remainder;
> +            src1 += remainder;
> +            src2 += remainder;
> +        }
> +    }
> +    s->merge(dst, src1, src2, bytes);
> +}
> +
> +static void merge16_unaligned(FastDeintContext *s, uint16_t *dst, const
> uint16_t *src1, const uint16_t *src2, size_t bytes)
> +{
> +    if (s->merge_aligned) {
> +        size_t words = bytes / 2;
> +        size_t remainder = words % 8;
> +        if (remainder > 0) {
> +            merge16_c(dst, src1, src2, remainder);
> +            words -= remainder;
> +            dst += remainder;
> +            src1 += remainder;
> +            src2 += remainder;
> +        }
> +    }
> +    s->merge(dst, src1, src2, bytes);
> +}
> +
> +static void merge_unaligned(FastDeintContext *s, void *dst, const void
> *src1, const void *src2, size_t bytes)
> +{
> +    if (s->merge_size == 16)
> +        merge16_unaligned(s, dst, src1, src2, bytes);
> +    else
> +        merge8_unaligned(s, dst, src1, src2, bytes);
> +}
> +
> +#if HAVE_SSE2_INLINE && defined(__x86_64__)
> +static void merge8_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t
> *src2, size_t bytes)
> +{
> +    for(; bytes > 0 && ((uintptr_t)src1 & 15); bytes--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +
> +    for (; bytes >= 16; bytes -= 16) {
> +        __asm__  __volatile__( "movdqu %2,%%xmm1;"
> +                               "pavgb %1, %%xmm1;"
> +                               "movdqu %%xmm1, %0" :"=m" (*dst):
> +                                                 "m" (*src1),
> +                                                 "m" (*src2) : "xmm1" );
> +        dst += 16;
> +        src1 += 16;
> +        src2 += 16;
> +    }
> +
> +    if (bytes > 0) {
> +        merge8_c(dst, src1, src2, bytes);
> +    }
> +}
> +static void merge16_sse2(uint16_t *dst, const uint16_t *src1, const
> uint16_t *src2, size_t bytes)
> +{
> +    size_t words = bytes / 2;
> +
> +    for(; words > 0 && ((uintptr_t)src1 & 15); words--)
> +        *dst++ = ( *src1++ + *src2++ ) >> 1;
> +
> +    for (; words >= 8; words -= 8) {
> +        __asm__  __volatile__( "movdqu %2,%%xmm1;"
> +                               "pavgw %1, %%xmm1;"
> +                               "movdqu %%xmm1, %0" :"=m" (*dst):
> +                                                 "m" (*src1),
> +                                                 "m" (*src2) : "xmm1" );
> +        dst += 8;
> +        src1 += 8;
> +        src2 += 8;
> +    }
> +


Unacceptable code. Inline assembly is forbidden.

> +    if (words > 0) {
> +        merge16_c(dst, src1, src2, words * 2);
> +    }
> +}
> +#define merge8 merge8_sse2
> +#define merge16 merge16_sse2
> +#else
> +#define merge8 merge8_c
> +#define merge16 merge16_c
> +#endif
> +
> +static void render_image_single(FastDeintContext *s, AVFrame *out, AVFrame
> *frame)
> +{
> +    int i, planes_nb = 0;
> +    enum Mode mode = s->mode;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
> +
> +    for (i = 0; i < desc->nb_components; i++)
> +        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
> +
> +    for (i = 0; i < planes_nb; i++) {
> +        int height, bwidth;
> +        int dst_linesize, src_linesize;
> +        const uint8_t *src;
> +        uint8_t *dst;
> +
> +        bwidth = av_image_get_linesize(out->format, out->width, i);
> +        if (bwidth < 0) {
> +            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
> +            return;
> +        }
> +
> +        height = out->height;
> +        if (i == 1 || i == 2) {
> +            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
> +        }
> +
> +        src = frame->data[i];
> +        dst = out->data[i];
> +        dst_linesize = out->linesize[i];
> +        src_linesize = frame->linesize[i];
> +
> +        if (mode == MODE_BLEND) {
> +            // Copy first line
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +            height--;
> +        }
> +
> +        // Merge remaining lines
> +        for (; height > 0; height--) {
> +            if (mode == MODE_DISCARD)
> +                memcpy(dst, src, bwidth);
> +            else
> +                merge_unaligned(s, dst, src, src + src_linesize, bwidth);
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            if (mode == MODE_MEAN || mode == MODE_DISCARD) {
> +                src += src_linesize;
> +                height--;
> +            }
> +        }
> +    }
> +    if (mode != MODE_DISCARD)
> +        emms_c();
> +}
> +
> +static void render_image_doubler(FastDeintContext *s, AVFrame *out, AVFrame
> *frame, int field)
> +{
> +    int i, planes_nb = 0;
> +    enum Mode mode = s->mode;
> +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(out->format);
> +
> +    for (i = 0; i < desc->nb_components; i++)
> +        planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
> +
> +    for (i = 0; i < planes_nb; i++) {
> +        int height, bwidth;
> +        int dst_linesize, src_linesize;
> +        const uint8_t *src;
> +        uint8_t *dst;
> +
> +        bwidth = av_image_get_linesize(out->format, out->width, i);
> +        if (bwidth < 0) {
> +            av_log(s, AV_LOG_ERROR, "av_image_get_linesize failed\n");
> +            return;
> +        }
> +        height = out->height;
> +        if (i == 1 || i == 2) {
> +            height = FF_CEIL_RSHIFT(out->height, desc->log2_chroma_h);
> +        }
> +
> +        src = frame->data[i];
> +        dst = out->data[i];
> +        src_linesize = frame->linesize[i];
> +        dst_linesize = out->linesize[i];
> +
> +        // For BOTTOM field we need to add the first line
> +        if (field == 1) {
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            height--;
> +        }
> +
> +        height -= 2;
> +
> +        for (; height > 0; height-=2) {
> +            memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +
> +            if (mode == MODE_LINEAR)
> +                merge_unaligned(s, dst, src, src + 2 * src_linesize,
> bwidth);
> +            else
> +                memcpy(dst, src, bwidth);
> +            dst += dst_linesize;
> +
> +            src += src_linesize * 2;
> +        }
> +
> +        memcpy(dst, src, bwidth);
> +
> +        // For TOP field we need to add the last line
> +        if (field == 0)
> +        {
> +            dst += dst_linesize;
> +            src += src_linesize;
> +            memcpy(dst, src, bwidth);
> +        }
> +    }
> +    if (mode == MODE_LINEAR)
> +        emms_c();
> +}
> +
> +static int filter_frame_single(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    AVFrame *out;
> +    FastDeintContext *s = ctx->priv;
> +
> +    if (!frame->interlaced_frame) {
> +        return ff_filter_frame(ctx->outputs[0], frame);
> +    }
> +
> +    out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
> +    if (!out) {
> +        av_frame_free(&frame);
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    av_frame_copy_props(out, frame);
> +    out->interlaced_frame = 0;
> +    render_image_single(s, out, frame);
> +
> +    av_frame_free(&frame);
> +    return ff_filter_frame(ctx->outputs[0], out);
> +}
> +
> +static AVFrame *copy_frame(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    AVFrame *out;
> +
> +    if (frame->format == AV_PIX_FMT_VIDEOTOOLBOX)
> +        out = av_frame_alloc();
> +    else
> +        out = ff_get_video_buffer(ctx->outputs[0], link->w, link->h);
> +
> +    if (!out)
> +        return NULL;
> +
> +    av_frame_copy_props(out, frame);
> +    return out;
> +}
> +
> +static int filter_frame_double(AVFilterLink *link, AVFrame *in)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    FastDeintContext *s = ctx->priv;
> +    AVFrame *frame, *out, *out2;
> +    int tff, ret;
> +
> +    s->cur = s->next;
> +    s->next = in;
> +
> +    if (!s->cur) {
> +        return 0;
> +    }
> +
> +    frame = s->cur;
> +
> +    if (!frame->interlaced_frame) {
> +        if (frame->pts != AV_NOPTS_VALUE)
> +            frame->pts *= 2;
> +        s->cur = NULL;
> +        return ff_filter_frame(ctx->outputs[0], frame);
> +    }
> +
> +    tff = frame->top_field_first;
> +    out = copy_frame(link, frame);
> +    if (!out) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    out->interlaced_frame = 0;
> +    if (out->pts != AV_NOPTS_VALUE)
> +        out->pts = out->pts * 2;
> +    render_image_doubler(s, out, frame, !tff);
> +
> +    ret = ff_filter_frame(ctx->outputs[0], out);
> +    if (ret < 0) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return ret;
> +    }
> +
> +    out2 = copy_frame(link, frame);
> +    if (!out2) {
> +        av_frame_free(&frame);
> +        s->cur = NULL;
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    out2->interlaced_frame = 0;
> +    av_frame_remove_side_data(out2, AV_FRAME_DATA_A53_CC);
> +    if (out2->pts != AV_NOPTS_VALUE) {
> +        out2->pts = frame->pts + s->next->pts;
> +    }
> +    render_image_doubler(s, out2, frame, tff);
> +
> +    av_frame_free(&frame);
> +    s->cur = NULL;
> +
> +    return ff_filter_frame(ctx->outputs[0], out2);
> +}
> +
> +static int filter_frame(AVFilterLink *link, AVFrame *frame)
> +{
> +    AVFilterContext *ctx = link->dst;
> +    FastDeintContext *s = ctx->priv;
> +
> +    av_assert0(frame);
> +
> +    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
> +        return filter_frame_double(link, frame);
> +    } else {
> +        return filter_frame_single(link, frame);
> +    }
> +}
> +
> +static av_cold void uninit(AVFilterContext *ctx)
> +{
> +    FastDeintContext *s = ctx->priv;
> +    av_frame_free(&s->cur);
> +    av_frame_free(&s->next);
> +}
> +
> +static int query_formats(AVFilterContext *ctx)
> +{
> +    static const enum AVPixelFormat pix_fmts[] = {
> +        AV_PIX_FMT_YUV420P,
> +        AV_PIX_FMT_YUV422P,
> +        AV_PIX_FMT_YUV444P,
> +        AV_PIX_FMT_YUV410P,
> +        AV_PIX_FMT_YUV411P,
> +        AV_PIX_FMT_GRAY8,
> +        AV_PIX_FMT_YUVJ420P,
> +        AV_PIX_FMT_YUVJ422P,
> +        AV_PIX_FMT_YUVJ444P,
> +        AV_PIX_FMT_GRAY16,
> +        AV_PIX_FMT_YUV440P,
> +        AV_PIX_FMT_YUVJ440P,
> +        AV_PIX_FMT_YUV420P9,
> +        AV_PIX_FMT_YUV422P9,
> +        AV_PIX_FMT_YUV444P9,
> +        AV_PIX_FMT_YUV420P10,
> +        AV_PIX_FMT_YUV422P10,
> +        AV_PIX_FMT_YUV444P10,
> +        AV_PIX_FMT_YUV420P12,
> +        AV_PIX_FMT_YUV422P12,
> +        AV_PIX_FMT_YUV444P12,
> +        AV_PIX_FMT_YUV420P14,
> +        AV_PIX_FMT_YUV422P14,
> +        AV_PIX_FMT_YUV444P14,
> +        AV_PIX_FMT_YUV420P16,
> +        AV_PIX_FMT_YUV422P16,
> +        AV_PIX_FMT_YUV444P16,
> +        AV_PIX_FMT_YUVA420P,
> +        AV_PIX_FMT_YUVA422P,
> +        AV_PIX_FMT_YUVA444P,
> +        AV_PIX_FMT_GBRP,
> +        AV_PIX_FMT_GBRP9,
> +        AV_PIX_FMT_GBRP10,
> +        AV_PIX_FMT_GBRP12,
> +        AV_PIX_FMT_GBRP14,
> +        AV_PIX_FMT_GBRP16,
> +        AV_PIX_FMT_GBRAP,
> +        AV_PIX_FMT_NONE
> +    };

Group these onto fewer lines somehow.

> +
> +    AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
> +    if (!fmts_list)
> +        return AVERROR(ENOMEM);
> +    return ff_set_common_formats(ctx, fmts_list);
> +}
> +
> +#if ARCH_ARM
> +#include "libavutil/arm/cpu.h"
> +#endif
> +#if ARCH_AARCH64
> +#include "libavutil/aarch64/cpu.h"
> +#endif
> +#if ARCH_AARCH64 || ARCH_ARM
> +void ff_merge8_neon(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
> size_t bytes);
> +void ff_merge16_neon(uint16_t *dst, const uint16_t *src1, const uint16_t
> *src2, size_t bytes);
> +void ff_merge8_armv6(uint8_t *dst, const uint8_t *src1, const uint8_t
> *src2, size_t bytes);
> +void ff_merge16_armv6(uint16_t *dst, const uint16_t *src1, const uint16_t
> *src2, size_t bytes);
> +#endif
> +


I do not like this style. Look at what other filters do, such as one that
adds x86 SIMD.

> +static int config_props(AVFilterLink *link)
> +{
> +    AVFilterContext *ctx = link->src;
> +    FastDeintContext *s = ctx->priv;
> +    const AVPixFmtDescriptor *pix;
> +#if ARCH_AARCH64 || ARCH_ARM
> +    int cpu_flags = av_get_cpu_flags();
> +#endif

This belongs in a separate directory and file. See the aarch64 directory.

> +
> +    link->w = link->src->inputs[0]->w;
> +    link->h = link->src->inputs[0]->h;
> +    link->time_base  = link->src->inputs[0]->time_base;
> +    link->frame_rate = link->src->inputs[0]->frame_rate;
> +    link->sample_aspect_ratio = link->src->inputs[0]->sample_aspect_ratio;
> +
> +    if (s->mode == MODE_MEAN || s->mode == MODE_DISCARD) {
> +        link->h /= 2;
> +        link->sample_aspect_ratio = av_mul_q(link->sample_aspect_ratio,
> av_make_q(1, 2));
> +    }
> +    if (s->mode == MODE_LINEAR || s->mode == MODE_BOB) {
> +        link->time_base  = av_mul_q(link->time_base,  av_make_q(1, 2));
> +        link->frame_rate = av_mul_q(link->frame_rate, av_make_q(2, 1));
> +    }
> +
> +    pix = av_pix_fmt_desc_get(link->format);
> +    s->merge_size = (pix->comp[0].depth > 8) ? 16 : 8;
> +    s->merge = s->merge_size == 16 ? (merge_fn)merge16 : (merge_fn)merge8;
> +
> +#if ARCH_ARM
> +    if (have_armv6(cpu_flags)) {
> +        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_armv6 :
> (merge_fn)ff_merge8_armv6;
> +        s->merge_aligned = 1;
> +    }
> +#endif
> +#if ARCH_AARCH64 || ARCH_ARM
> +    if (have_neon(cpu_flags)) {
> +        s->merge = s->merge_size == 16 ? (merge_fn)ff_merge16_neon :
> (merge_fn)ff_merge8_neon;
> +        s->merge_aligned = 1;
> +    }
> +#endif
> +
> +    return 0;
> +}
> +
> +static int request_frame(AVFilterLink *link)
> +{
> +    AVFilterContext *ctx = link->src;
> +    FastDeintContext *s = ctx->priv;
> +    int ret;
> +
> +    if (s->eof)
> +        return AVERROR_EOF;
> +
> +    ret = ff_request_frame(ctx->inputs[0]);
> +
> +    if (ret == AVERROR_EOF && s->cur) {
> +        AVFrame *next = av_frame_clone(s->next);
> +        if (!next)
> +            return AVERROR(ENOMEM);
> +
> +        next->pts = s->next->pts * 2 - s->cur->pts;
> +        filter_frame(ctx->inputs[0], next);
> +        s->eof = 1;
> +    } else if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
> +#define OFFSET(x) offsetof(FastDeintContext, x)
> +#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
> +
> +#define CONST(name, help, val, unit) { name, help, 0, AV_OPT_TYPE_CONST,
> {.i64=val}, INT_MIN, INT_MAX, FLAGS, unit }
> +
> +static const AVOption fastdeint_options[] = {
> +    { "mode", "specify the deinterlacing mode", OFFSET(mode),
> AV_OPT_TYPE_INT, {.i64=MODE_BLEND}, 0, MODE_MAX-1, FLAGS, "mode" },
> +    CONST("discard", "discard bottom frame", MODE_DISCARD, "mode"),
> +    CONST("mean", "half resolution blender", MODE_MEAN, "mode"),
> +    CONST("blend", "full resolution blender", MODE_BLEND, "mode"),
> +    CONST("bob", "bob doubler", MODE_BOB, "mode"),
> +    CONST("linear", "bob doubler with linear interpolation", MODE_LINEAR,
> "mode"),
> +
> +    { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(fastdeint);
> +
> +static const AVFilterPad fastdeint_inputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .filter_frame  = filter_frame,
> +    },
> +    { NULL }
> +};
> +
> +static const AVFilterPad fastdeint_outputs[] = {
> +    {
> +        .name          = "default",
> +        .type          = AVMEDIA_TYPE_VIDEO,
> +        .config_props  = config_props,
> +        .request_frame = request_frame
> +    },
> +    { NULL }
> +};
> +
> +AVFilter ff_vf_fastdeint = {
> +    .name          = "fastdeint",
> +    .description   = NULL_IF_CONFIG_SMALL("fast deinterlacing algorithms"),

First letter should be capitalized.

> +    .priv_size     = sizeof(FastDeintContext),
> +    .priv_class    = &fastdeint_class,
> +    .uninit        = uninit,
> +    .query_formats = query_formats,
> +    .inputs        = fastdeint_inputs,
> +    .outputs       = fastdeint_outputs,
> +};
> --
> 2.20.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> [email protected]
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> [email protected] with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [RFC PATCH] avfilter/fastdeint: import simple cpu-optimized deinterlacing algorithms from VLC

Reply via email to