From 9af4bcb4f7698aeea589598aa31f612210ceeb4d Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Sun, 9 Aug 2020 17:47:34 +0200
Subject: [PATCH] avcodec/cfhd: add x86 SIMD

---
 libavcodec/Makefile           |   2 +-
 libavcodec/cfhd.c             | 132 ++++++--------------
 libavcodec/cfhd.h             |   3 +
 libavcodec/cfhddsp.c          |  57 +++++++++
 libavcodec/cfhddsp.h          |  36 ++++++
 libavcodec/cfhdfilter.h       |  67 +++++++++++
 libavcodec/x86/Makefile       |   2 +
 libavcodec/x86/cfhddsp.asm    | 220 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/cfhddsp_init.c |  41 +++++++
 9 files changed, 461 insertions(+), 99 deletions(-)
 create mode 100644 libavcodec/cfhddsp.c
 create mode 100644 libavcodec/cfhddsp.h
 create mode 100644 libavcodec/cfhdfilter.h
 create mode 100644 libavcodec/x86/cfhddsp.asm
 create mode 100644 libavcodec/x86/cfhddsp_init.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index fc4294816e..26038e96d3 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -254,7 +254,7 @@ OBJS-$(CONFIG_CCAPTION_DECODER)        += ccaption_dec.o ass.o
 OBJS-$(CONFIG_CDGRAPHICS_DECODER)      += cdgraphics.o
 OBJS-$(CONFIG_CDTOONS_DECODER)         += cdtoons.o
 OBJS-$(CONFIG_CDXL_DECODER)            += cdxl.o
-OBJS-$(CONFIG_CFHD_DECODER)            += cfhd.o cfhddata.o
+OBJS-$(CONFIG_CFHD_DECODER)            += cfhd.o cfhddata.o cfhddsp.o
 OBJS-$(CONFIG_CFHD_ENCODER)            += cfhdenc.o cfhddata.o
 OBJS-$(CONFIG_CINEPAK_DECODER)         += cinepak.o
 OBJS-$(CONFIG_CINEPAK_ENCODER)         += cinepakenc.o elbg.o
diff --git a/libavcodec/cfhd.c b/libavcodec/cfhd.c
index f30b50beb0..9446a18dda 100644
--- a/libavcodec/cfhd.c
+++ b/libavcodec/cfhd.c
@@ -36,6 +36,7 @@
 #include "internal.h"
 #include "thread.h"
 #include "cfhd.h"
+#include "cfhdfilter.h"
 
 #define ALPHA_COMPAND_DC_OFFSET 256
 #define ALPHA_COMPAND_GAIN 9400
@@ -190,47 +191,6 @@ static inline void process_bayer(AVFrame *frame, int bpc)
     }
 }
 
-static inline void filter(int16_t *output, ptrdiff_t out_stride,
-                          int16_t *low, ptrdiff_t low_stride,
-                          int16_t *high, ptrdiff_t high_stride,
-                          int len, int clip)
-{
-    int16_t tmp;
-    int i;
-
-    tmp = (11*low[0*low_stride] - 4*low[1*low_stride] + low[2*low_stride] + 4) >> 3;
-    output[(2*0+0)*out_stride] = (tmp + high[0*high_stride]) >> 1;
-    if (clip)
-        output[(2*0+0)*out_stride] = av_clip_uintp2_c(output[(2*0+0)*out_stride], clip);
-
-    tmp = ( 5*low[0*low_stride] + 4*low[1*low_stride] - low[2*low_stride] + 4) >> 3;
-    output[(2*0+1)*out_stride] = (tmp - high[0*high_stride]) >> 1;
-    if (clip)
-        output[(2*0+1)*out_stride] = av_clip_uintp2_c(output[(2*0+1)*out_stride], clip);
-
-    for (i = 1; i < len - 1; i++) {
-        tmp = (low[(i-1)*low_stride] - low[(i+1)*low_stride] + 4) >> 3;
-        output[(2*i+0)*out_stride] = (tmp + low[i*low_stride] + high[i*high_stride]) >> 1;
-        if (clip)
-            output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip);
-
-        tmp = (low[(i+1)*low_stride] - low[(i-1)*low_stride] + 4) >> 3;
-        output[(2*i+1)*out_stride] = (tmp + low[i*low_stride] - high[i*high_stride]) >> 1;
-        if (clip)
-            output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip);
-    }
-
-    tmp = ( 5*low[i*low_stride] + 4*low[(i-1)*low_stride] - low[(i-2)*low_stride] + 4) >> 3;
-    output[(2*i+0)*out_stride] = (tmp + high[i*high_stride]) >> 1;
-    if (clip)
-        output[(2*i+0)*out_stride] = av_clip_uintp2_c(output[(2*i+0)*out_stride], clip);
-
-    tmp = (11*low[i*low_stride] - 4*low[(i-1)*low_stride] + low[(i-2)*low_stride] + 4) >> 3;
-    output[(2*i+1)*out_stride] = (tmp - high[i*high_stride]) >> 1;
-    if (clip)
-        output[(2*i+1)*out_stride] = av_clip_uintp2_c(output[(2*i+1)*out_stride], clip);
-}
-
 static inline void interlaced_vertical_filter(int16_t *output, int16_t *low, int16_t *high,
                          int width, int linesize, int plane)
 {
@@ -256,24 +216,6 @@ static inline void inverse_temporal_filter(int16_t *output, int16_t *low, int16_
     }
 }
 
-static void horiz_filter(int16_t *output, int16_t *low, int16_t *high,
-                         int width)
-{
-    filter(output, 1, low, 1, high, 1, width, 0);
-}
-
-static void horiz_filter_clip(int16_t *output, int16_t *low, int16_t *high,
-                              int width, int clip)
-{
-    filter(output, 1, low, 1, high, 1, width, clip);
-}
-
-static void horiz_filter_clip_bayer(int16_t *output, int16_t *low, int16_t *high,
-                                    int width, int clip)
-{
-    filter(output, 2, low, 1, high, 1, width, clip);
-}
-
 static void vert_filter(int16_t *output, ptrdiff_t out_stride,
                         int16_t *low, ptrdiff_t low_stride,
                         int16_t *high, ptrdiff_t high_stride, int len)
@@ -311,6 +253,8 @@ static int alloc_buffers(AVCodecContext *avctx)
         return ret;
     avctx->pix_fmt = s->coded_format;
 
+    ff_cfhddsp_init(&s->dsp, s->bpc, avctx->pix_fmt == AV_PIX_FMT_BAYER_RGGB16);
+
     if ((ret = av_pix_fmt_get_chroma_sub_sample(s->coded_format,
                                                 &chroma_x_shift,
                                                 &chroma_y_shift)) < 0)
@@ -343,13 +287,13 @@ static int alloc_buffers(AVCodecContext *avctx)
         h2 = h4 * 2;
 
         if (s->transform_type == 0) {
-            s->plane[i].idwt_size = FFALIGN(height, 8) * stride;
+            s->plane[i].idwt_size = FFALIGN(height, 8) * stride + SUBBAND_COUNT * 64;
             s->plane[i].idwt_buf =
                 av_mallocz_array(s->plane[i].idwt_size, sizeof(*s->plane[i].idwt_buf));
             s->plane[i].idwt_tmp =
                 av_malloc_array(s->plane[i].idwt_size, sizeof(*s->plane[i].idwt_tmp));
         } else {
-            s->plane[i].idwt_size = FFALIGN(height, 8) * stride * 2;
+            s->plane[i].idwt_size = FFALIGN(height, 8) * stride * 2 + SUBBAND_COUNT_3D * 64;
             s->plane[i].idwt_buf =
                 av_mallocz_array(s->plane[i].idwt_size, sizeof(*s->plane[i].idwt_buf));
             s->plane[i].idwt_tmp =
@@ -360,16 +304,16 @@ static int alloc_buffers(AVCodecContext *avctx)
             return AVERROR(ENOMEM);
 
         s->plane[i].subband[0] = s->plane[i].idwt_buf;
-        s->plane[i].subband[1] = s->plane[i].idwt_buf + 2 * w8 * h8;
-        s->plane[i].subband[2] = s->plane[i].idwt_buf + 1 * w8 * h8;
-        s->plane[i].subband[3] = s->plane[i].idwt_buf + 3 * w8 * h8;
-        s->plane[i].subband[4] = s->plane[i].idwt_buf + 2 * w4 * h4;
-        s->plane[i].subband[5] = s->plane[i].idwt_buf + 1 * w4 * h4;
-        s->plane[i].subband[6] = s->plane[i].idwt_buf + 3 * w4 * h4;
+        s->plane[i].subband[1] = s->plane[i].idwt_buf + 2 * (w8 * h8 + 64);
+        s->plane[i].subband[2] = s->plane[i].idwt_buf + 1 * (w8 * h8 + 64);
+        s->plane[i].subband[3] = s->plane[i].idwt_buf + 3 * (w8 * h8 + 64);
+        s->plane[i].subband[4] = s->plane[i].idwt_buf + 2 * (w4 * h4 + 64);
+        s->plane[i].subband[5] = s->plane[i].idwt_buf + 1 * (w4 * h4 + 64);
+        s->plane[i].subband[6] = s->plane[i].idwt_buf + 3 * (w4 * h4 + 64);
         if (s->transform_type == 0) {
-            s->plane[i].subband[7] = s->plane[i].idwt_buf + 2 * w2 * h2;
-            s->plane[i].subband[8] = s->plane[i].idwt_buf + 1 * w2 * h2;
-            s->plane[i].subband[9] = s->plane[i].idwt_buf + 3 * w2 * h2;
+            s->plane[i].subband[7] = s->plane[i].idwt_buf + 2 * (w2 * h2 + 64);
+            s->plane[i].subband[8] = s->plane[i].idwt_buf + 1 * (w2 * h2 + 64);
+            s->plane[i].subband[9] = s->plane[i].idwt_buf + 3 * (w2 * h2 + 64);
         } else {
             int16_t *frame2 =
             s->plane[i].subband[7]  = s->plane[i].idwt_buf + 4 * w2 * h2;
@@ -404,18 +348,18 @@ static int alloc_buffers(AVCodecContext *avctx)
 
         /* ll2 and ll1 commented out because they are done in-place */
         s->plane[i].l_h[0] = s->plane[i].idwt_tmp;
-        s->plane[i].l_h[1] = s->plane[i].idwt_tmp + 2 * w8 * h8;
+        s->plane[i].l_h[1] = s->plane[i].idwt_tmp + 2 * (w8 * h8 + 64);
         // s->plane[i].l_h[2] = ll2;
         s->plane[i].l_h[3] = s->plane[i].idwt_tmp;
-        s->plane[i].l_h[4] = s->plane[i].idwt_tmp + 2 * w4 * h4;
+        s->plane[i].l_h[4] = s->plane[i].idwt_tmp + 2 * (w4 * h4 + 64);
         // s->plane[i].l_h[5] = ll1;
         s->plane[i].l_h[6] = s->plane[i].idwt_tmp;
-        s->plane[i].l_h[7] = s->plane[i].idwt_tmp + 2 * w2 * h2;
+        s->plane[i].l_h[7] = s->plane[i].idwt_tmp + 2 * (w2 * h2 + 64);
         if (s->transform_type != 0) {
-            int16_t *frame2 = s->plane[i].idwt_tmp + 4 * w2 * h2;
+            int16_t *frame2 = s->plane[i].idwt_tmp + 4 * (w2 * h2 + 64);
 
             s->plane[i].l_h[8] = frame2;
-            s->plane[i].l_h[9] = frame2 + 2 * w2 * h2;
+            s->plane[i].l_h[9] = frame2 + 2 * (w2 * h2 + 64);
         }
     }
 
@@ -430,6 +374,7 @@ static int cfhd_decode(AVCodecContext *avctx, void *data, int *got_frame,
                        AVPacket *avpkt)
 {
     CFHDContext *s = avctx->priv_data;
+    CFHDDSPContext *dsp = &s->dsp;
     GetByteContext gb;
     ThreadFrame frame = { .f = data };
     AVFrame *pic = data;
@@ -1008,7 +953,7 @@ finish:
             high   = s->plane[plane].l_h[1];
             output = s->plane[plane].subband[0];
             for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
+                dsp->horiz_filter(output, low, high, lowpass_width);
                 low    += lowpass_width;
                 high   += lowpass_width;
                 output += lowpass_width * 2;
@@ -1061,7 +1006,7 @@ finish:
             high   = s->plane[plane].l_h[4];
             output = s->plane[plane].subband[0];
             for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
+                dsp->horiz_filter(output, low, high, lowpass_width);
                 low    += lowpass_width;
                 high   += lowpass_width;
                 output += lowpass_width * 2;
@@ -1128,10 +1073,7 @@ finish:
                 }
 
                 for (i = 0; i < lowpass_height * 2; i++) {
-                    if (avctx->pix_fmt == AV_PIX_FMT_BAYER_RGGB16)
-                        horiz_filter_clip_bayer(dst, low, high, lowpass_width, s->bpc);
-                    else
-                        horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
+                    dsp->horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
                     if (avctx->pix_fmt == AV_PIX_FMT_GBRAP12 && act_plane == 3)
                         process_alpha(dst, lowpass_width * 2);
                     low  += lowpass_width;
@@ -1145,7 +1087,7 @@ finish:
                 high   = s->plane[plane].subband[7];
                 output = s->plane[plane].l_h[6];
                 for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
+                    dsp->horiz_filter(output, low, high, lowpass_width);
                     low    += lowpass_width;
                     high   += lowpass_width;
                     output += lowpass_width * 2;
@@ -1155,7 +1097,7 @@ finish:
                 high   = s->plane[plane].subband[9];
                 output = s->plane[plane].l_h[7];
                 for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
+                    dsp->horiz_filter(output, low, high, lowpass_width);
                     low    += lowpass_width;
                     high   += lowpass_width;
                     output += lowpass_width * 2;
@@ -1221,7 +1163,7 @@ finish:
             high   = s->plane[plane].l_h[1];
             output = s->plane[plane].l_h[7];
             for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
+                dsp->horiz_filter(output, low, high, lowpass_width);
                 low    += lowpass_width;
                 high   += lowpass_width;
                 output += lowpass_width * 2;
@@ -1273,7 +1215,7 @@ finish:
             high   = s->plane[plane].l_h[4];
             output = s->plane[plane].l_h[7];
             for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
+                dsp->horiz_filter(output, low, high, lowpass_width);
                 low    += lowpass_width;
                 high   += lowpass_width;
                 output += lowpass_width * 2;
@@ -1310,7 +1252,7 @@ finish:
             high   = s->plane[plane].l_h[4];
             output = s->plane[plane].l_h[9];
             for (i = 0; i < lowpass_height * 2; i++) {
-                horiz_filter(output, low, high, lowpass_width);
+                dsp->horiz_filter(output, low, high, lowpass_width);
                 low    += lowpass_width;
                 high   += lowpass_width;
                 output += lowpass_width * 2;
@@ -1399,10 +1341,7 @@ finish:
                 low  = s->plane[plane].l_h[6];
                 high = s->plane[plane].l_h[7];
                 for (i = 0; i < lowpass_height * 2; i++) {
-                    if (avctx->pix_fmt == AV_PIX_FMT_BAYER_RGGB16)
-                        horiz_filter_clip_bayer(dst, low, high, lowpass_width, s->bpc);
-                    else
-                        horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
+                    dsp->horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
                     low  += lowpass_width;
                     high += lowpass_width;
                     dst  += dst_linesize;
@@ -1413,7 +1352,7 @@ finish:
                 high   = s->plane[plane].subband[14];
                 output = s->plane[plane].l_h[6];
                 for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
+                    dsp->horiz_filter(output, low, high, lowpass_width);
                     low    += lowpass_width;
                     high   += lowpass_width;
                     output += lowpass_width * 2;
@@ -1423,7 +1362,7 @@ finish:
                 high   = s->plane[plane].subband[16];
                 output = s->plane[plane].l_h[7];
                 for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
+                    dsp->horiz_filter(output, low, high, lowpass_width);
                     low    += lowpass_width;
                     high   += lowpass_width;
                     output += lowpass_width * 2;
@@ -1433,7 +1372,7 @@ finish:
                 high   = s->plane[plane].subband[11];
                 output = s->plane[plane].l_h[8];
                 for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
+                    dsp->horiz_filter(output, low, high, lowpass_width);
                     low    += lowpass_width;
                     high   += lowpass_width;
                     output += lowpass_width * 2;
@@ -1443,7 +1382,7 @@ finish:
                 high   = s->plane[plane].subband[13];
                 output = s->plane[plane].l_h[9];
                 for (i = 0; i < lowpass_height; i++) {
-                    horiz_filter(output, low, high, lowpass_width);
+                    dsp->horiz_filter(output, low, high, lowpass_width);
                     low    += lowpass_width;
                     high   += lowpass_width;
                     output += lowpass_width * 2;
@@ -1505,10 +1444,7 @@ finish:
                 }
 
                 for (i = 0; i < lowpass_height * 2; i++) {
-                    if (avctx->pix_fmt == AV_PIX_FMT_BAYER_RGGB16)
-                        horiz_filter_clip_bayer(dst, low, high, lowpass_width, s->bpc);
-                    else
-                        horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
+                    dsp->horiz_filter_clip(dst, low, high, lowpass_width, s->bpc);
                     low  += lowpass_width;
                     high += lowpass_width;
                     dst  += dst_linesize;
diff --git a/libavcodec/cfhd.h b/libavcodec/cfhd.h
index dc329b724b..fdc6f1e546 100644
--- a/libavcodec/cfhd.h
+++ b/libavcodec/cfhd.h
@@ -29,6 +29,7 @@
 #include "bytestream.h"
 #include "get_bits.h"
 #include "vlc.h"
+#include "cfhddsp.h"
 
 enum CFHDParam {
     SampleType       =   1,
@@ -178,6 +179,8 @@ typedef struct CFHDContext {
     uint8_t prescale_table[8];
     Plane plane[4];
     Peak peak;
+
+    CFHDDSPContext dsp;
 } CFHDContext;
 
 int ff_cfhd_init_vlcs(CFHDContext *s);
diff --git a/libavcodec/cfhddsp.c b/libavcodec/cfhddsp.c
new file mode 100644
index 0000000000..d099eaa6d1
--- /dev/null
+++ b/libavcodec/cfhddsp.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2015-2016 Kieran Kunhya <kieran@kunhya.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "libavutil/avassert.h"
+
+#include "cfhdfilter.h"
+#include "cfhddsp.h"
+
+static void horiz_filter(int16_t *output, const int16_t *low, const int16_t *high,
+                         int width)
+{
+    filter(output, 1, low, 1, high, 1, width, 0); /* unit strides; clip == 0, so filter() never clamps */
+}
+
+static void horiz_filter_clip(int16_t *output, const int16_t *low, const int16_t *high,
+                              int width, int clip)
+{
+    filter(output, 1, low, 1, high, 1, width, clip); /* unit strides; filter() clamps when clip is non-zero */
+}
+
+static void horiz_filter_clip_bayer(int16_t *output, const int16_t *low, const int16_t *high,
+                                    int width, int clip)
+{
+    filter(output, 2, low, 1, high, 1, width, clip); /* out_stride 2: results land on every other int16_t */
+}
+
+av_cold void ff_cfhddsp_init(CFHDDSPContext *c, int depth, int bayer)
+{
+    /* Start from the C versions; Bayer output uses the stride-2 writer. */
+    c->horiz_filter      = horiz_filter;
+    c->horiz_filter_clip = bayer ? horiz_filter_clip_bayer
+                                 : horiz_filter_clip;
+
+    /* Let the x86 init replace the pointers set above; depth and bayer
+     * are forwarded unchanged so it can decide which ones to override. */
+    if (ARCH_X86)
+        ff_cfhddsp_init_x86(c, depth, bayer);
+}
diff --git a/libavcodec/cfhddsp.h b/libavcodec/cfhddsp.h
new file mode 100644
index 0000000000..b147972bd9
--- /dev/null
+++ b/libavcodec/cfhddsp.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CFHDDSP_H
+#define AVCODEC_CFHDDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct CFHDDSPContext { /* function pointers filled in by ff_cfhddsp_init() */
+    void (*horiz_filter)(int16_t *output, const int16_t *low, const int16_t *high,
+                         int width); /* C version: width low/high inputs, 2 * width outputs, no clamping */
+    void (*horiz_filter_clip)(int16_t *output, const int16_t *low, const int16_t *high,
+                              int width, int bpc); /* C version: same, clamped via av_clip_uintp2_c(x, bpc) */
+} CFHDDSPContext;
+
+void ff_cfhddsp_init(CFHDDSPContext *c, int depth, int bayer);     /* installs the C filters, then x86 overrides */
+
+void ff_cfhddsp_init_x86(CFHDDSPContext *c, int depth, int bayer); /* called by ff_cfhddsp_init() if ARCH_X86 */
+
+#endif /* AVCODEC_CFHDDSP_H */
diff --git a/libavcodec/cfhdfilter.h b/libavcodec/cfhdfilter.h
new file mode 100644
index 0000000000..2096660001
--- /dev/null
+++ b/libavcodec/cfhdfilter.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2015-2016 Kieran Kunhya <kieran@kunhya.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_CFHDFILTER_H
+#define AVCODEC_CFHDFILTER_H
+#include <stddef.h>
+#include <stdint.h>
+#include "libavutil/common.h"
+/* Narrow v to int16_t at *dst; when clip != 0, run it through av_clip_uintp2_c(). */
+static av_always_inline void cfhd_filter_store(int16_t *dst, int v, int clip)
+{
+    *dst = v;
+    if (clip)
+        *dst = av_clip_uintp2_c(*dst, clip);
+}
+
+/* Combine len low-pass and len high-pass coefficients into 2*len outputs.
+ * Every stride counts int16_t elements; clip == 0 disables the clamping.
+ * NOTE(review): the edge taps index low[2] and low[i-2]; looks like len >= 3 is assumed. */
+static av_always_inline void filter(int16_t *output, ptrdiff_t out_stride,
+                          const int16_t *low, ptrdiff_t low_stride,
+                          const int16_t *high, ptrdiff_t high_stride,
+                          int len, int clip)
+{
+    int16_t pred; /* int16_t: the >> 3 result is narrowed before high is applied */
+    int n;
+
+    /* Leading pair: one-sided taps on the first three low coefficients. */
+    pred = (11*low[0] - 4*low[low_stride] + low[2*low_stride] + 4) >> 3;
+    cfhd_filter_store(&output[0], (pred + high[0]) >> 1, clip);
+    pred = ( 5*low[0] + 4*low[low_stride] - low[2*low_stride] + 4) >> 3;
+    cfhd_filter_store(&output[out_stride], (pred - high[0]) >> 1, clip);
+
+    /* Interior pairs: low[n] +/- high[n], corrected by the neighbour difference. */
+    for (n = 1; n < len - 1; n++) {
+        int16_t *pair = &output[2*n*out_stride];
+        pred = (low[(n-1)*low_stride] - low[(n+1)*low_stride] + 4) >> 3;
+        cfhd_filter_store(pair, (pred + low[n*low_stride] + high[n*high_stride]) >> 1, clip);
+        pred = (low[(n+1)*low_stride] - low[(n-1)*low_stride] + 4) >> 3;
+        cfhd_filter_store(pair + out_stride, (pred + low[n*low_stride] - high[n*high_stride]) >> 1, clip);
+    }
+
+    /* Trailing pair, mirrored taps; n is where the loop stopped. */
+    pred = ( 5*low[n*low_stride] + 4*low[(n-1)*low_stride] - low[(n-2)*low_stride] + 4) >> 3;
+    cfhd_filter_store(&output[(2*n+0)*out_stride], (pred + high[n*high_stride]) >> 1, clip);
+    pred = (11*low[n*low_stride] - 4*low[(n-1)*low_stride] + low[(n-2)*low_stride] + 4) >> 3;
+    cfhd_filter_store(&output[(2*n+1)*out_stride], (pred - high[n*high_stride]) >> 1, clip);
+}
+
+#endif /* AVCODEC_CFHDFILTER_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 194135dafb..884dc0c759 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -50,6 +50,7 @@ OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
 OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
 OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
+OBJS-$(CONFIG_CFHD_DECODER)            += x86/cfhddsp_init.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
@@ -153,6 +154,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
 X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
 X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
+X86ASM-OBJS-$(CONFIG_CFHD_DECODER)     += x86/cfhddsp.o
 X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
 X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
                                           x86/dirac_dwt.o
diff --git a/libavcodec/x86/cfhddsp.asm b/libavcodec/x86/cfhddsp.asm
new file mode 100644
index 0000000000..9e1aca88d8
--- /dev/null
+++ b/libavcodec/x86/cfhddsp.asm
@@ -0,0 +1,220 @@
+;******************************************************************************
+;* x86-optimized functions for the CFHD decoder
+;* Copyright (c) 2020 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+factor_p1_p1: dw 1,  1, 1,  1, 1,  1, 1,  1,   ; pmaddwd weights: a + b for each word pair
+factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1,   ; pmaddwd weights: a - b for each word pair
+pd_4: times 4 dd 4                             ; rounding term added before the >> 3
+pw_0: times 8 dw 0                             ; lower bound handed to CLIPW
+pd_m1: times 4 dd -1                           ; psignd operand: negates every dword
+pw_1023: times 8 dw 1023                       ; upper bound handed to CLIPW (clip10 variant)
+
+SECTION .text
+
+; %1 == 0: unclipped filter. %1 == 1: same filter, every output clamped to 0..1023.
+%macro CFHD_HORIZ_FILTER 1
+%if %1
+cglobal cfhd_horiz_filter_clip10, 5, 6, 8, output, low, high, width, bpc
+    DEFINE_ARGS    output, low, high, width, x, temp
+%else
+cglobal cfhd_horiz_filter, 4, 6, 8, output, low, high, width, x
+    DEFINE_ARGS    output, low, high, width, x, temp
+%endif
+    shl        widthd, 1                        ; width becomes a byte count
+
+    movsx          xq, word [lowq]              ; left edge, even output:
+    imul           xq, 11                       ; 11*low[0] - 4*low[1] + low[2]
+
+    movsx       tempq, word [lowq + 2]
+    imul        tempq, -4
+    add         tempq, xq
+
+    movsx          xq, word [lowq + 4]
+    add         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+
+    movsx          xq, word [highq]
+    add         tempq, xq
+    sar         tempq, 1
+
+%if %1
+    movd           m0, tempd
+    CLIPW          m0, [pw_0], [pw_1023]
+    ; pextrw to a register is SSE2; its memory-destination form is SSE4.1.
+    pextrw      tempd, xm0, 0
+%endif
+    mov  word [outputq], tempw
+
+    movsx          xq, word [lowq]              ; left edge, odd output:
+    imul           xq, 5                        ; 5*low[0] + 4*low[1] - low[2]
+
+    movsx       tempq, word [lowq + 2]
+    imul        tempq, 4
+    add         tempq, xq
+
+    movsx          xq, word [lowq + 4]
+    sub         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+
+    movsx          xq, word [highq]
+    sub         tempq, xq
+    sar         tempq, 1
+
+%if %1
+    movd           m0, tempd
+    CLIPW          m0, [pw_0], [pw_1023]
+    pextrw      tempd, xm0, 0
+%endif
+    mov  word [outputq + 2], tempw
+
+    mov            xq, 0
+
+    ; NOTE(review): each pass loads mmsize bytes from low/high and stores 2*mmsize bytes,
+    ; with no tail handling, so it can touch memory past width samples - confirm padding.
+.loop:
+    movu           m4, [lowq + xq]              ; low[i-1] for the outputs of this pass
+    movu           m1, [lowq + xq + 4]          ; low[i+1]
+
+    mova           m5, m4
+    punpcklwd      m4, m1
+    punpckhwd      m5, m1
+
+    pmaddwd        m4, [factor_p1_n1]           ; low[i-1] - low[i+1]
+    pmaddwd        m5, [factor_p1_n1]
+
+    mova           m6, m4
+    mova           m7, m5
+    psignd         m6, m6, [pd_m1]              ; negated copy: low[i+1] - low[i-1]
+    psignd         m7, m7, [pd_m1]
+
+    paddd          m4, [pd_4]
+    paddd          m5, [pd_4]
+    paddd          m6, [pd_4]
+    paddd          m7, [pd_4]
+
+    psrad          m4, 3
+    psrad          m5, 3
+    psrad          m6, 3
+    psrad          m7, 3
+
+    movu           m2, [lowq + xq + 2]          ; low[i]
+    movu           m3, [highq + xq + 2]         ; high[i]
+
+    mova           m0, m2
+    punpcklwd      m2, m3
+    punpckhwd      m0, m3
+
+    mova           m1, m2
+    mova           m3, m0
+
+    pmaddwd        m2, [factor_p1_p1]           ; low[i] + high[i]
+    pmaddwd        m0, [factor_p1_p1]
+    pmaddwd        m1, [factor_p1_n1]           ; low[i] - high[i]
+    pmaddwd        m3, [factor_p1_n1]
+
+    paddd          m2, m4
+    paddd          m0, m5
+    paddd          m1, m6
+    paddd          m3, m7
+
+    psrad          m2, 1
+    psrad          m0, 1
+    psrad          m1, 1
+    psrad          m3, 1
+
+    packssdw       m2, m0
+    packssdw       m1, m3
+
+    mova           m0, m2
+    punpcklwd      m2, m1
+    punpckhwd      m0, m1
+
+%if %1
+    CLIPW          m2, [pw_0], [pw_1023]
+    CLIPW          m0, [pw_0], [pw_1023]
+%endif
+
+    movu  [outputq + xq * 2 + 4], m2
+    movu  [outputq + xq * 2 + mmsize + 4], m0
+
+    add            xq, mmsize
+    cmp            xq, widthq
+    jl .loop
+
+    movsx          xq, word [lowq + widthq - 2] ; right edge, even output:
+    imul           xq, 5                        ; 5*low[n-1] + 4*low[n-2] - low[n-3], n = width
+
+    movsx       tempq, word [lowq + widthq - 4]
+    imul        tempq, 4
+    add         tempq, xq
+
+    movsx          xq, word [lowq + widthq - 6]
+    sub         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+
+    movsx          xq, word [highq + widthq - 2]
+    add         tempq, xq
+    sar         tempq, 1
+
+%if %1
+    movd           m0, tempd
+    CLIPW          m0, [pw_0], [pw_1023]
+    pextrw      tempd, xm0, 0
+%endif
+    mov  word [outputq + widthq * 2 - 4], tempw
+
+    movsx          xq, word [lowq + widthq - 2] ; right edge, odd output:
+    imul           xq, 11                       ; 11*low[n-1] - 4*low[n-2] + low[n-3], n = width
+
+    movsx       tempq, word [lowq + widthq - 4]
+    imul        tempq, -4
+    add         tempq, xq
+
+    movsx          xq, word [lowq + widthq - 6]
+    add         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+
+    movsx          xq, word [highq + widthq - 2]
+    sub         tempq, xq
+    sar         tempq, 1
+
+%if %1
+    movd           m0, tempd
+    CLIPW          m0, [pw_0], [pw_1023]
+    pextrw      tempd, xm0, 0
+%endif
+    mov  word [outputq + widthq * 2 - 2], tempw
+
+    RET
+%endmacro
+
+INIT_XMM ssse3                 ; the loop's psignd is an SSSE3 instruction
+CFHD_HORIZ_FILTER 0            ; emits cfhd_horiz_filter
+
+INIT_XMM ssse3
+CFHD_HORIZ_FILTER 1            ; emits cfhd_horiz_filter_clip10
diff --git a/libavcodec/x86/cfhddsp_init.c b/libavcodec/x86/cfhddsp_init.c
new file mode 100644
index 0000000000..a7db61a344
--- /dev/null
+++ b/libavcodec/x86/cfhddsp_init.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2020 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/cfhddsp.h"
+
+void ff_cfhd_horiz_filter_ssse3(int16_t *output, const int16_t *low, const int16_t *high, int width);
+void ff_cfhd_horiz_filter_clip10_ssse3(int16_t *output, const int16_t *low, const int16_t *high, int width, int bpc);
+
+av_cold void ff_cfhddsp_init_x86(CFHDDSPContext *c, int depth, int bayer)
+{
+    const int flags = av_get_cpu_flags();
+
+    if (!EXTERNAL_SSSE3(flags))
+        return; /* leave the pointers already stored in c untouched */
+    c->horiz_filter = ff_cfhd_horiz_filter_ssse3;
+    if (!bayer && depth == 10) /* the asm clamps to 0..1023 with unit output stride */
+        c->horiz_filter_clip = ff_cfhd_horiz_filter_clip10_ssse3;
+}
-- 
2.26.2

