Hi,

2015-02-02 17:16 GMT+01:00 Mickaël Raulet <mrau...@insa-rennes.fr>:
> https://github.com/OpenHEVC/FFmpeg/commit/940300945995c20f7583394ebe6907e72829b4a
That commit no longer applies cleanly, as multiple fixes and improvements have been committed since then. The attached patch fixes that, and passes on a non-AVX2 machine. I can't test it beyond that, and I'm not looking forward to debugging through an SSH shell.

Also, who is the actual author? It has been committed under your name, but shouldn't that rather be P-E Lepere? And I guess I'll drop the previous patch for now.

-- 
Christophe
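For reference, taking the decicycle figures in the commit message below at face value, the AVX2 code comes out roughly 1.65x faster for luma_bi_1 (33304/20185), 1.55x for luma_bi_2 (38138/24620) and 1.30x for luma_uni (13490/10397). That is my arithmetic, and it assumes the timer-harness numbers scale linearly.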
From f326724af77a65acc42eaabf17db6c30e8b7f75c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Raulet?= <mrau...@insa-rennes.fr>
Date: Thu, 31 Jul 2014 01:15:20 +0200
Subject: [PATCH] x86/hevc: add MC AVX2 optimizations

before
33304 decicycles in luma_bi_1, 523066 runs, 1222 skips
38138 decicycles in luma_bi_2, 523427 runs, 861 skips
13490 decicycles in luma_uni, 516138 runs, 8150 skips

after
20185 decicycles in luma_bi_1, 519970 runs, 4318 skips
24620 decicycles in luma_bi_2, 521024 runs, 3264 skips
10397 decicycles in luma_uni, 515715 runs, 8573 skips

Conflicts:
	libavcodec/x86/hevc_mc.asm
	libavcodec/x86/hevcdsp_init.c
---
 libavcodec/x86/hevc_mc.asm    | 584 +++++++++++++++++++++++++++++++-----------
 libavcodec/x86/hevcdsp.h      | 105 ++++++++
 libavcodec/x86/hevcdsp_init.c | 371 ++++++++++++++++++++++++++-
 3 files changed, 910 insertions(+), 150 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 8f9f939..e8d1e3a 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -20,19 +20,20 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-pw_8:          times 8 dw (1 << 9)
-pw_10:         times 8 dw (1 << 11)
-pw_12:         times 8 dw (1 << 13)
-pw_bi_8:       times 8 dw (1 << 8)
-pw_bi_10:      times 8 dw (1 << 10)
-pw_bi_12:      times 8 dw (1 << 12)
-max_pixels_10: times 8 dw ((1 << 10)-1)
-max_pixels_12: times 8 dw ((1 << 12)-1)
-zero:          times 4 dd 0
-one_per_32:    times 4 dd 1
-
-SECTION .text
+SECTION_RODATA 32
+pw_8:          times 16 dw (1 << 9)
+pw_10:         times 16 dw (1 << 11)
+pw_12:         times 16 dw (1 << 13)
+pw_bi_8:       times 16 dw (1 << 8)
+pw_bi_10:      times 16 dw (1 << 10)
+pw_bi_12:      times 16 dw (1 << 12)
+max_pixels_8:  times 16 dw ((1 << 8)-1)
+max_pixels_10: times 16 dw ((1 << 10)-1)
+max_pixels_12: times 16 dw ((1 << 12)-1)
+zero:          times 8 dd 0
+one_per_32:    times 8 dd 1
+
+SECTION_TEXT 32
 %macro EPEL_TABLE 4
 hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
                         times %2 d%3 10, -2
@@ -51,6 +52,8 @@ hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
 %endmacro
 
+EPEL_TABLE  8,16, b, avx2
+EPEL_TABLE 10, 8, w, avx2
 EPEL_TABLE  8, 8, b, sse4
 EPEL_TABLE 10, 4, w, sse4
@@ -75,10 +78,15 @@ QPEL_TABLE  8, 8, b, sse4
 QPEL_TABLE 10, 4, w, sse4
 QPEL_TABLE 12, 4, w, sse4
 
+QPEL_TABLE  8,16, b, avx2
+QPEL_TABLE 10, 8, w, avx2
+
 %define MAX_PB_SIZE 64
 
 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
 
+%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
+
 %if ARCH_X86_64
 
 %macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
@@ -87,11 +95,22 @@ QPEL_TABLE 12, 4, w, sse4
 %elif %1 <= 8
     movdqa        %3, [%2]          ; load data from source2
 %elif %1 <= 12
+%if avx_enabled
+    mova          %3, [%2]
+%else
     movdqa        %3, [%2]          ; load data from source2
     movq          %4, [%2+16]       ; load data from source2
+%endif ;avx
+%elif %1 <= 16
+%if avx_enabled
+    movu          %3, [%2]
 %else
     movdqa        %3, [%2]          ; load data from source2
     movdqa        %4, [%2+16]       ; load data from source2
+%endif ; avx
+%else ; %1 = 32
+    movu          %3, [%2]
+    movu          %4, [%2+32]
 %endif
 %endmacro
@@ -100,71 +119,108 @@ QPEL_TABLE 12, 4, w, sse4
     movd          %4, [%3]          ; load data from source
 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
     movq          %4, [%3]          ; load data from source
+%elif notcpuflag(avx)
+    movu          %4, [%3]          ; load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+    movdqu        %4, [%3]
 %else
-    movdqu        %4, [%3]          ; load data from source
+    movu          %4, [%3]
 %endif
 %endmacro
 
-%macro SIMPLE_8LOAD 5    ;width, bitd, tab, r1, r2
-%if %1 == 2 || (%2 == 8 && %1 <= 4)
-    movq          %4, [%3]          ; load data from source2
-%elif %1 == 4 || (%2 == 8 && %1 <= 8)
-    movdqa        %4, [%3]          ; load data from source2
-%elif %1 <= 12
-    movdqa        %4, [%3]          ; load data from source2
-    movq          %5, [%3+16]       ; load data from source2
-%else
-    movdqa        %4, [%3]          ; load data from source2
-    movdqa        %5, [%3+16]       ; load data from source2
-%endif
-%endmacro
 
 %macro EPEL_FILTER 2-4 ; bit depth, filter index
+%if avx_enabled
+%assign %%offset 32
+%ifdef PIC
+    lea           rfilterq, [hevc_epel_filters_avx2_%1]
+%else
+    %define rfilterq hevc_epel_filters_avx2_%1
+%endif
+%else
+%assign %%offset 16
 %ifdef PIC
     lea           rfilterq, [hevc_epel_filters_sse4_%1]
 %else
    %define rfilterq hevc_epel_filters_sse4_%1
 %endif
+%endif ;avx_enabled
    sub           %2q, 1
+%if avx_enabled
+    shl           %2q, 6                      ; multiply by 64
+%else
    shl           %2q, 5                      ; multiply by 32
-    movdqa        %3, [rfilterq + %2q]        ; get 2 first values of filters
-    movdqa        %4, [rfilterq + %2q+16]     ; get 2 last values of filters
+%endif
+%if %0 == 2
+    mova          m14, [rfilterq + %2q]           ; get 2 first values of filters
+    mova          m15, [rfilterq + %2q+%%offset]  ; get 2 last values of filters
+%else
+    mova          %3, [rfilterq + %2q]            ; get 2 first values of filters
+    mova          %4, [rfilterq + %2q+%%offset]   ; get 2 last values of filters
+%endif
 %endmacro
 
 %macro EPEL_HV_FILTER 1
+%if avx_enabled
+%assign %%offset 32
+%assign %%shift  6
+%define %%table  hevc_epel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift  5
+%define %%table  hevc_epel_filters_sse4_%1
+%endif
+
 %ifdef PIC
-    lea           rfilterq, [hevc_epel_filters_sse4_%1]
+    lea           rfilterq, [%%table]
 %else
-    %define rfilterq hevc_epel_filters_sse4_%1
+    %define rfilterq %%table
 %endif
    sub           mxq, 1
    sub           myq, 1
-    shl           mxq, 5                      ; multiply by 32
-    shl           myq, 5                      ; multiply by 32
-    movdqa        m14, [rfilterq + mxq]       ; get 2 first values of filters
-    movdqa        m15, [rfilterq + mxq+16]    ; get 2 last values of filters
+    shl           mxq, %%shift                ; multiply by 32
+    shl           myq, %%shift                ; multiply by 32
+    mova          m14, [rfilterq + mxq]           ; get 2 first values of filters
+    mova          m15, [rfilterq + mxq+%%offset]  ; get 2 last values of filters
    lea           r3srcq, [srcstrideq*3]
 
+%if avx_enabled
+%define %%table  hevc_epel_filters_avx2_10
+%else
+%define %%table  hevc_epel_filters_sse4_10
+%endif
 %ifdef PIC
-    lea           rfilterq, [hevc_epel_filters_sse4_10]
+    lea           rfilterq, [%%table]
 %else
-    %define rfilterq hevc_epel_filters_sse4_10
+    %define rfilterq %%table
 %endif
-    movdqa        m12, [rfilterq + myq]       ; get 2 first values of filters
-    movdqa        m13, [rfilterq + myq+16]    ; get 2 last values of filters
+    mova          m12, [rfilterq + myq]           ; get 2 first values of filters
+    mova          m13, [rfilterq + myq+%%offset]  ; get 2 last values of filters
 %endmacro
 
 %macro QPEL_FILTER 2
+
+%if avx_enabled
+%assign %%offset 32
+%assign %%shift  7
+%define %%table  hevc_qpel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift  6
+%define %%table  hevc_qpel_filters_sse4_%1
+%endif
+
 %ifdef PIC
-    lea           rfilterq, [hevc_qpel_filters_sse4_%1]
+    lea           rfilterq, [%%table]
 %else
-    %define rfilterq hevc_qpel_filters_sse4_%1
+    %define rfilterq %%table
 %endif
-    lea           %2q, [%2q*8-8]
-    movdqa        m12, [rfilterq + %2q*8]       ; get 4 first values of filters
-    movdqa        m13, [rfilterq + %2q*8 + 16]  ; get 4 first values of filters
-    movdqa        m14, [rfilterq + %2q*8 + 32]  ; get 4 first values of filters
-    movdqa        m15, [rfilterq + %2q*8 + 48]  ; get 4 first values of filters
+    sub           %2q, 1
+    shl           %2q, %%shift                     ; multiply by 32
+    mova          m12, [rfilterq + %2q]            ; get 4 first values of filters
+    mova          m13, [rfilterq + %2q + %%offset]   ; get 4 first values of filters
+    mova          m14, [rfilterq + %2q + 2*%%offset] ; get 4 first values of filters
+    mova          m15, [rfilterq + %2q + 3*%%offset] ; get 4 first values of filters
 %endmacro
 
 %macro EPEL_LOAD 4
@@ -191,19 +247,18 @@ QPEL_TABLE 12, 4, w, sse4
 %%load        m2, [rfilterq+2*%3q]
 %%load        m3, [rfilterq+r3srcq]
 %endif
-
 %if %1 == 8
 %if %4 > 8
-    SBUTTERFLY    bw, 0, 1, 10
-    SBUTTERFLY    bw, 2, 3, 10
+    SBUTTERFLY    bw, 0, 1, 7
+    SBUTTERFLY    bw, 2, 3, 7
 %else
    punpcklbw     m0, m1
    punpcklbw     m2, m3
 %endif
 %else
 %if %4 > 4
-    SBUTTERFLY    wd, 0, 1, 10
-    SBUTTERFLY    wd, 2, 3, 10
+    SBUTTERFLY    wd, 0, 1, 7
+    SBUTTERFLY    wd, 2, 3, 7
 %else
    punpcklwd     m0, m1
    punpcklwd     m2, m3
@@ -220,7 +275,7 @@ QPEL_TABLE 12, 4, w, sse4
 %elif %3 == 8
 %define %%load movq
 %else
-%define %%load movdqu
+%define %%load movu
 %endif
 %else
 %if %3 == 2
@@ -228,7 +283,7 @@ QPEL_TABLE 12, 4, w, sse4
 %elif %3 == 4
 %define %%load movq
 %else
-%define %%load movdqu
+%define %%load movu
 %endif
 %endif
 %%load        m0, [%2-3*%%stride]        ;load data from source
@@ -247,10 +302,10 @@ QPEL_TABLE 12, 4, w, sse4
    SBUTTERFLY    wd, 4, 5, %4
    SBUTTERFLY    wd, 6, 7, %4
 %else
-    punpcklwd     m0, m1
-    punpcklwd     m2, m3
-    punpcklwd     m4, m5
-    punpcklwd     m6, m7
+    punpcklbw     m0, m1
+    punpcklbw     m2, m3
+    punpcklbw     m4, m5
+    punpcklbw     m6, m7
 %endif
 %else
 %if %3 > 4
@@ -259,10 +314,10 @@ QPEL_TABLE 12, 4, w, sse4
    SBUTTERFLY    dq, 4, 5, %4
    SBUTTERFLY    dq, 6, 7, %4
 %else
-    punpckldq     m0, m1
-    punpckldq     m2, m3
-    punpckldq     m4, m5
-    punpckldq     m6, m7
+    punpcklwd     m0, m1
+    punpcklwd     m2, m3
+    punpcklwd     m4, m5
+    punpcklwd     m6, m7
 %endif
 %endif
 %endmacro
@@ -270,14 +325,14 @@ QPEL_TABLE 12, 4, w, sse4
 %macro QPEL_V_LOAD 5
    lea           %5q, [%2]
    sub           %5q, r3srcq
-    movdqu        m0, [%5q ]            ;load x- 3*srcstride
-    movdqu        m1, [%5q+ %3q ]       ;load x- 2*srcstride
-    movdqu        m2, [%5q+ 2*%3q ]     ;load x-srcstride
-    movdqu        m3, [%2 ]             ;load x
-    movdqu        m4, [%2+ %3q]         ;load x+stride
-    movdqu        m5, [%2+ 2*%3q]       ;load x+2*stride
-    movdqu        m6, [%2+r3srcq]       ;load x+3*stride
-    movdqu        m7, [%2+ 4*%3q]       ;load x+4*stride
+    movu          m0, [%5q ]            ;load x- 3*srcstride
+    movu          m1, [%5q+ %3q ]       ;load x- 2*srcstride
+    movu          m2, [%5q+ 2*%3q ]     ;load x-srcstride
+    movu          m3, [%2 ]             ;load x
+    movu          m4, [%2+ %3q]         ;load x+stride
+    movu          m5, [%2+ 2*%3q]       ;load x+2*stride
+    movu          m6, [%2+r3srcq]       ;load x+3*stride
+    movu          m7, [%2+ 4*%3q]       ;load x+4*stride
 %if %1 == 8
 %if %4 > 8
    SBUTTERFLY    bw, 0, 1, 8
@@ -347,8 +402,17 @@ QPEL_TABLE 12, 4, w, sse4
    movq          [%1+16], %3
 %endmacro
 %macro PEL_10STORE16 3
+%if avx_enabled
+    movu          [%1], %2
+%else
    PEL_10STORE8  %1, %2, %3
    movdqa        [%1+16], %3
+%endif
+%endmacro
+
+%macro PEL_10STORE32 3
+    PEL_10STORE16 %1, %2, %3
+    movu          [%1+32], %3
 %endmacro
 
 %macro PEL_8STORE2 3
@@ -370,7 +434,14 @@ QPEL_TABLE 12, 4, w, sse4
    movd          [%1+8], %2
 %endmacro
 %macro PEL_8STORE16 3
-    movdqa        [%1], %2
+%if avx_enabled
+    movdqu        [%1], %2
+%else
+    mova          [%1], %2
+%endif ; avx
+%endmacro
+%macro PEL_8STORE32 3
+    movu          [%1], %2
 %endmacro
 
 %macro LOOP_END 3
@@ -381,65 +452,109 @@ QPEL_TABLE 12, 4, w, sse4
 %endmacro
 
-
-%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
+%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
 %if %2 == 8
+%if avx_enabled && %0 ==3
+%if %1 > 16
+    vextracti128  xm1, m0, 1
+    pmovzxbw      m1, xm1
+    psllw         m1, 14-%2
+%endif
+    pmovzxbw      m0, xm0
+%else ; not avx
 %if %1 > 8
-    punpckhbw     m1, m0, m2
-    psllw         m1, 14-%2
+    punpckhbw    m1, m0, m2
+    psllw        m1, 14-%2
 %endif
-    punpcklbw     m0, m2
+    punpcklbw    m0, m2
 %endif
-    psllw         m0, 14-%2
+%endif ;avx
+    psllw        m0, 14-%2
 %endmacro
 
-%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
+%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
+%if %0 == 8
+%define %%reg0 %5
+%define %%reg2 %6
+%define %%reg1 %7
+%define %%reg3 %8
+%else
+%define %%reg0 m0
+%define %%reg2 m2
+%define %%reg1 m1
+%define %%reg3 m3
+%endif
 %if %1 == 8
-    pmaddubsw     m0, %3   ;x1*c1+x2*c2
-    pmaddubsw     m2, %4   ;x3*c3+x4*c4
-    paddw         m0, m2
+%if avx_enabled && (%0 == 5)
+%if %2 > 16
+    vextracti128  xm10, m0, 1
+    vinserti128   m10, m1, xm10, 0
+%endif
+    vinserti128   m0, m0, xm1, 1
+    mova          m1, m10
+%if %2 > 16
+    vextracti128  xm10, m2, 1
+    vinserti128   m10, m3, xm10, 0
+%endif
+    vinserti128   m2, m2, xm3, 1
+    mova          m3, m10
+%endif
+    pmaddubsw     %%reg0, %3   ;x1*c1+x2*c2
+    pmaddubsw     %%reg2, %4   ;x3*c3+x4*c4
+    paddw         %%reg0, %%reg2
 %if %2 > 8
-    pmaddubsw     m1, %3
-    pmaddubsw     m3, %4
-    paddw         m1, m3
+    pmaddubsw     %%reg1, %3
+    pmaddubsw     %%reg3, %4
+    paddw         %%reg1, %%reg3
 %endif
 %else
-    pmaddwd       m0, %3
-    pmaddwd       m2, %4
-    paddd         m0, m2
+    pmaddwd       %%reg0, %3
+    pmaddwd       %%reg2, %4
+    paddd         %%reg0, %%reg2
 %if %2 > 4
-    pmaddwd       m1, %3
-    pmaddwd       m3, %4
-    paddd         m1, m3
+    pmaddwd       %%reg1, %3
+    pmaddwd       %%reg3, %4
+    paddd         %%reg1, %%reg3
+%if %1 != 8
+    psrad         %%reg1, %1-8
+%endif
 %endif
 %if %1 != 8
-    psrad         m0, %1-8
-    psrad         m1, %1-8
+    psrad         %%reg0, %1-8
 %endif
-    packssdw      m0, m1
+    packssdw      %%reg0, %%reg1
 %endif
 %endmacro
 
 %macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx
+
+%if avx_enabled
+%assign %%offset 32
+%define %%table  hevc_qpel_filters_avx2_%2
+%else
+%assign %%offset 16
+%define %%table  hevc_qpel_filters_sse4_%2
+%endif
+
 %ifdef PIC
-    lea           rfilterq, [hevc_qpel_filters_sse4_%2]
+    lea           rfilterq, [%%table]
 %else
-    %define rfilterq hevc_qpel_filters_sse4_%2
+    %define rfilterq %%table
 %endif
 
 %if %2 == 8
    pmaddubsw     m0, [rfilterq + %3q*8   ]   ;x1*c1+x2*c2
-    pmaddubsw     m2, [rfilterq + %3q*8+16]   ;x3*c3+x4*c4
-    pmaddubsw     m4, [rfilterq + %3q*8+32]   ;x5*c5+x6*c6
-    pmaddubsw     m6, [rfilterq + %3q*8+48]   ;x7*c7+x8*c8
+    pmaddubsw     m2, [rfilterq + %3q*8+%%offset]   ;x3*c3+x4*c4
+    pmaddubsw     m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6
+    pmaddubsw     m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8
    paddw         m0, m2
    paddw         m4, m6
    paddw         m0, m4
 %else
    pmaddwd       m0, [rfilterq + %3q*8   ]
-    pmaddwd       m2, [rfilterq + %3q*8+16]
-    pmaddwd       m4, [rfilterq + %3q*8+32]
-    pmaddwd       m6, [rfilterq + %3q*8+48]
+    pmaddwd       m2, [rfilterq + %3q*8+%%offset]
+    pmaddwd       m4, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd       m6, [rfilterq + %3q*8+3*%%offset]
    paddd         m0, m2
    paddd         m4, m6
    paddd         m0, m4
@@ -448,9 +563,9 @@ QPEL_TABLE 12, 4, w, sse4
 %endif
 %if %1 > 4
    pmaddwd       m1, [rfilterq + %3q*8   ]
-    pmaddwd       m3, [rfilterq + %3q*8+16]
-    pmaddwd       m5, [rfilterq + %3q*8+32]
-    pmaddwd       m7, [rfilterq + %3q*8+48]
+    pmaddwd       m3, [rfilterq + %3q*8+%%offset]
+    pmaddwd       m5, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd       m7, [rfilterq + %3q*8+3*%%offset]
    paddd         m1, m3
    paddd         m5, m7
    paddd         m1, m5
@@ -462,8 +577,32 @@ QPEL_TABLE 12, 4, w, sse4
 %endif
 %endmacro
 
-%macro QPEL_COMPUTE 2 ; width, bitdepth
+%macro QPEL_COMPUTE 2-3 ; width, bitdepth
 %if %2 == 8
+%if avx_enabled && (%0 == 3)
+
+    vextracti128  xm10, m0, 1
+    vinserti128   m10, m1, xm10, 0
+    vinserti128   m0, m0, xm1, 1
+    mova          m1, m10
+
+    vextracti128  xm10, m2, 1
+    vinserti128   m10, m3, xm10, 0
+    vinserti128   m2, m2, xm3, 1
+    mova          m3, m10
+
+
+    vextracti128  xm10, m4, 1
+    vinserti128   m10, m5, xm10, 0
+    vinserti128   m4, m4, xm5, 1
+    mova          m5, m10
+
+    vextracti128  xm10, m6, 1
+    vinserti128   m10, m7, xm10, 0
+    vinserti128   m6, m6, xm7, 1
+    mova          m7, m10
+%endif
+
    pmaddubsw     m0, m12   ;x1*c1+x2*c2
    pmaddubsw     m2, m13   ;x3*c3+x4*c4
    pmaddubsw     m4, m14   ;x5*c5+x6*c6
@@ -506,12 +645,16 @@ QPEL_TABLE 12, 4, w, sse4
 %endif
 %endmacro
 
-%macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
+%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
    paddsw        %3, %5
 %if %1 > 8
    paddsw        %4, %6
 %endif
    UNI_COMPUTE   %1, %2, %3, %4, %7
+%if %0 == 8 && avx_enabled && (%2 == 8)
+    vpermq        %3, %3, 216
+    vpermq        %4, %4, 216
+%endif
 %endmacro
 
 %macro UNI_COMPUTE 5
@@ -524,14 +667,14 @@ QPEL_TABLE 12, 4, w, sse4
 %else
    pminsw        %3, [max_pixels_%2]
    pmaxsw        %3, [zero]
-%if %1 > 8
+%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
    pminsw        %4, [max_pixels_%2]
    pmaxsw        %4, [zero]
 %endif
 %endif
 %endmacro
 
-INIT_XMM sse4                                ; adds ff_ and _sse4 to function name
+
 ; ******************************
 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
 ;                         uint8_t *_src, ptrdiff_t _srcstride,
@@ -539,16 +682,25 @@ INIT_XMM sse4 ; adds ff_ and _sse4 to functio
 ; ******************************
 
 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
+HEVC_PEL_PIXELS     %1, %2
+HEVC_UNI_PEL_PIXELS %1, %2
+HEVC_BI_PEL_PIXELS  %1, %2
+%endmacro
+
+%macro HEVC_PEL_PIXELS 2
 cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
    pxor              m2, m2
 .loop
    SIMPLE_LOAD       %1, %2, srcq, m0
-    MC_PIXEL_COMPUTE  %1, %2
+    MC_PIXEL_COMPUTE  %1, %2, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET
+
 %endmacro
 
-cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
+%macro HEVC_UNI_PEL_PIXELS 2
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
+    pxor              m2, m2
 .loop
    SIMPLE_LOAD       %1, %2, srcq, m0
    PEL_%2STORE%1     dstq, m0, m1
@@ -557,15 +709,17 @@ cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstri
    dec               heightd                     ; cmp height
    jnz               .loop                       ; height loop
    RET
+%endmacro
 
+%macro HEVC_BI_PEL_PIXELS 2
 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
    pxor              m2, m2
    movdqa            m5, [pw_bi_%2]
 .loop
    SIMPLE_LOAD       %1, %2, srcq, m0
    SIMPLE_BILOAD     %1, src2q, m3, m4
-    MC_PIXEL_COMPUTE  %1, %2
-    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5
+    MC_PIXEL_COMPUTE  %1, %2, 1
+    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -573,7 +727,6 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
    dec               heightd                     ; cmp height
    jnz               .loop                       ; height loop
    RET
-
 %endmacro
@@ -591,7 +744,7 @@ cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 11, dst, src, srcstride, height, mx, rf
    EPEL_FILTER       %2, mx, m4, m5
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
-    EPEL_COMPUTE      %2, %1, m4, m5
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET
@@ -616,9 +769,9 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride,
    EPEL_FILTER       %2, mx, m4, m5
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
-    EPEL_COMPUTE      %2, %1, m4, m5
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
-    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -640,7 +793,7 @@ cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 11, dst, src, srcstride, height, r3src,
    EPEL_FILTER       %2, my, m4, m5
 .loop
    EPEL_LOAD         %2, srcq, srcstride, %1
-    EPEL_COMPUTE      %2, %1, m4, m5
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET
@@ -669,9 +822,9 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 11, dst, dststride, src, srcstride,
    EPEL_FILTER       %2, my, m4, m5
 .loop
    EPEL_LOAD         %2, srcq, srcstride, %1
-    EPEL_COMPUTE      %2, %1, m4, m5
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
-    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -695,19 +848,31 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m8, m1
+%endif
    SWAP m4, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m9, m1
+%endif
    SWAP m5, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m10, m1
+%endif
    SWAP m6, m0
    add               srcq, srcstrideq
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m11, m1
+%endif
    SWAP m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
@@ -716,10 +881,31 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
    punpckhwd         m3, m6, m7
 %endif
    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+%if avx_enabled
+    vinserti128       m2, m0, xm4, 1
+    vextracti128      xm3, m0, 1
+    vinserti128       m3, m4, xm3, 0
+    PEL_10STORE%1     dstq, m2, m3
+%else
+    PEL_10STORE%1     dstq, m0, m4
+%endif
+%else
    PEL_10STORE%1     dstq, m0, m1
+%endif
    movdqa            m4, m5
    movdqa            m5, m6
    movdqa            m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova              m10, m11
+%endif
    LOOP_END          dst, src, srcstride
    RET
@@ -729,20 +915,32 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m8, m1
+%endif
    SWAP m4, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m9, m1
+%endif
    SWAP m5, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m10, m1
+%endif
    SWAP m6, m0
    add               srcq, srcstrideq
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
-    SWAP m7, m0
+%if (%1 > 8 && (%2 == 8))
+    SWAP m11, m1
+%endif
+    mova              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
 %if %1 > 4
@@ -750,37 +948,62 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
    punpckhwd         m3, m6, m7
 %endif
    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
+%else
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+%endif
    PEL_%2STORE%1     dstq, m0, m1
-    movdqa            m4, m5
-    movdqa            m5, m6
-    movdqa            m6, m7
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova              m10, m11
+%endif
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
    dec               heightd                     ; cmp height
    jnz               .loop                       ; height loop
    RET
-
 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
 %assign %%stride ((%2 + 7)/8)
    sub               srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m8, m1
+%endif
    SWAP m4, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m9, m1
+%endif
    SWAP m5, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m10, m1
+%endif
    SWAP m6, m0
    add               srcq, srcstrideq
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m11, m1
+%endif
    SWAP m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
@@ -789,12 +1012,34 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride
    punpckhwd         m3, m6, m7
 %endif
    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    SIMPLE_BILOAD     %1, src2q, m8, m3
+%if avx_enabled
+    vinserti128       m1, m8, xm3, 1
+    vextracti128      xm8, m8, 1
+    vinserti128       m2, m3, xm8, 0
+    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
+%else
+    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
+%endif
+%else
    SIMPLE_BILOAD     %1, src2q, m8, m9
    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
-    PEL_%2STORE%1     dstq, m0, m1
-    movdqa            m4, m5
-    movdqa            m5, m6
-    movdqa            m6, m7
+%endif
+    PEL_%2STORE%1     dstq, m0, m4
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova              m10, m11
+%endif
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
    add               src2q, 2*MAX_PB_SIZE        ; src += srcstride
@@ -814,7 +1059,7 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rf
    QPEL_FILTER       %2, mx
 .loop
    QPEL_H_LOAD       %2, srcq, %1, 10
-    QPEL_COMPUTE      %1, %2
+    QPEL_COMPUTE      %1, %2, 1
 %if %2 > 8
    packssdw          m0, m1
 %endif
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET
@@ -823,7 +1068,7 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rf
    RET
 
 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
-    movdqa            m9, [pw_%2]
+    mova              m9, [pw_%2]
    QPEL_FILTER       %2, mx
 .loop
    QPEL_H_LOAD       %2, srcq, %1, 10
@@ -844,12 +1089,12 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride,
    QPEL_FILTER       %2, mx
 .loop
    QPEL_H_LOAD       %2, srcq, %1, 10
-    QPEL_COMPUTE      %1, %2
+    QPEL_COMPUTE      %1, %2, 1
 %if %2 > 8
    packssdw          m0, m1
 %endif
    SIMPLE_BILOAD     %1, src2q, m10, m11
-    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -870,7 +1115,7 @@ cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 16, dst, src, srcstride, height, r3src,
    QPEL_FILTER       %2, my
 .loop
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
-    QPEL_COMPUTE      %1, %2
+    QPEL_COMPUTE      %1, %2, 1
 %if %2 > 8
    packssdw          m0, m1
 %endif
@@ -901,13 +1146,13 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride,
    lea               r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
 .loop
-    SIMPLE_BILOAD     %1, src2q, m10, m11
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
-    QPEL_COMPUTE      %1, %2
+    QPEL_COMPUTE      %1, %2, 1
 %if %2 > 8
    packssdw          m0, m1
 %endif
-    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -925,8 +1170,15 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride,
 ; ******************************
 %macro HEVC_PUT_HEVC_QPEL_HV 2
 cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
-    lea               mxq, [mxq*8-8]
-    lea               myq, [myq*8-8]
+%if avx_enabled
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub               mxq, 1
+    sub               myq, 1
+    shl               mxq, %%shift                ; multiply by 32
+    shl               myq, %%shift                ; multiply by 32
    lea               r3srcq, [srcstrideq*3]
    sub               srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
@@ -994,8 +1246,15 @@ cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, m
    RET
 
 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
-    lea               mxq, [mxq*8-8]
-    lea               myq, [myq*8-8]
+%if avx_enabled
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub               mxq, 1
+    sub               myq, 1
+    shl               mxq, %%shift                ; multiply by 32
+    shl               myq, %%shift                ; multiply by 32
    lea               r3srcq, [srcstrideq*3]
    sub               srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
@@ -1053,13 +1312,13 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
    movq              m13, m14
    movq              m14, m15
 %else
-    movdqa            m8, m9
-    movdqa            m9, m10
-    movdqa            m10, m11
-    movdqa            m11, m12
-    movdqa            m12, m13
-    movdqa            m13, m14
-    movdqa            m14, m15
+    mova              m8, m9
+    mova              m9, m10
+    mova              m10, m11
+    mova              m11, m12
+    mova              m12, m13
+    mova              m13, m14
+    mova              m14, m15
 %endif
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -1068,8 +1327,15 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
    RET
 
 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
-    lea               mxq, [mxq*8-8]
-    lea               myq, [myq*8-8]
+%if avx_enabled
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub               mxq, 1
+    sub               myq, 1
+    shl               mxq, %%shift                ; multiply by 32
+    shl               myq, %%shift                ; multiply by 32
    lea               r3srcq, [srcstrideq*3]
    sub               srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
@@ -1286,6 +1552,8 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2,
    RET
 %endmacro
 
+INIT_XMM sse4                                ; adds ff_ and _sse4 to function name
+
 WEIGHTING_FUNCS 2, 8
 WEIGHTING_FUNCS 4, 8
 WEIGHTING_FUNCS 6, 8
@@ -1340,6 +1608,7 @@ HEVC_PUT_HEVC_EPEL_HV 2, 8
 HEVC_PUT_HEVC_EPEL_HV 4, 8
 HEVC_PUT_HEVC_EPEL_HV 6, 8
 HEVC_PUT_HEVC_EPEL_HV 8, 8
+HEVC_PUT_HEVC_EPEL_HV 16, 8
 
 HEVC_PUT_HEVC_EPEL_HV 2, 10
 HEVC_PUT_HEVC_EPEL_HV 4, 10
@@ -1377,4 +1646,23 @@ HEVC_PUT_HEVC_QPEL_HV 4, 12
 HEVC_PUT_HEVC_QPEL_HV 6, 12
 HEVC_PUT_HEVC_QPEL_HV 8, 12
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. avx_enabled = 1 / notcpuflag(avx) = 0
+
+HEVC_PUT_HEVC_PEL_PIXELS 32, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+
+HEVC_PUT_HEVC_EPEL 32, 8
+HEVC_PUT_HEVC_EPEL 16, 10
+
+HEVC_PUT_HEVC_EPEL_HV 16, 10
+HEVC_PUT_HEVC_EPEL_HV 32, 8
+
+HEVC_PUT_HEVC_QPEL 32, 8
+
+HEVC_PUT_HEVC_QPEL 16, 10
+
+HEVC_PUT_HEVC_QPEL_HV 16, 10
+
+%endif ;AVX2
 %endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 8dea142..7864163 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -96,6 +96,40 @@ void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dstst
 EPEL_PROTOTYPES(pel_pixels , 8, sse4);
 EPEL_PROTOTYPES(pel_pixels , 10, sse4);
 EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
 ///////////////////////////////////////////////////////////////////////////////
 // EPEL
 ///////////////////////////////////////////////////////////////////////////////
@@ -111,6 +145,42 @@ EPEL_PROTOTYPES(epel_hv , 8, sse4);
 EPEL_PROTOTYPES(epel_hv , 10, sse4);
 EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
 ///////////////////////////////////////////////////////////////////////////////
 // QPEL
 ///////////////////////////////////////////////////////////////////////////////
@@ -126,6 +196,41 @@ QPEL_PROTOTYPES(qpel_hv, 8, sse4);
 QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
 
 WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 5a01ff6..dd5f49a 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -165,6 +165,149 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dsts
 
 #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 
+#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                      \
+void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst,                               \
+                                                 uint8_t *src, ptrdiff_t _srcstride, int height,   \
+                                                 intptr_t mx, intptr_t my, int width)              \
+{                                                                                            \
+    ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width);      \
+    ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                   \
+void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+                                                    ptrdiff_t _srcstride, int16_t*src2,      \
+                                                    int height, intptr_t mx, intptr_t my, int width) \
+{                                                                                            \
+    ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2,    \
+                                                   height, mx, my, width);                   \
+    ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2, \
+                                                   height, mx, my, width);                   \
+}
+
+#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                  \
+void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride,      \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height, \
+                                                     intptr_t mx, intptr_t my, int width)    \
+{                                                                                            \
+    ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride,         \
+                                                    height, mx, my, width);                  \
+    ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \
+                                                    height, mx, my, width);                  \
+}
+
+#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4)                     \
+mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4);                             \
+mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4);                          \
+mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
+
+#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)                               \
+void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst,                                \
+                                                uint8_t *src, ptrdiff_t _srcstride, int height,    \
+                                                intptr_t mx, intptr_t my, int width)               \
+{                                                                                            \
+    ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width);       \
+    ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2)                            \
+void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+                                                   ptrdiff_t _srcstride, int16_t*src2,       \
+                                                   int height, intptr_t mx, intptr_t my, int width) \
+{                                                                                            \
+    ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, src2,     \
+                                                  height, mx, my, width);                    \
+    ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, src2+width2, \
+                                                  height, mx, my, width);                    \
+}
+
+#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)                           \
+void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride,       \
+                                                    uint8_t *src, ptrdiff_t _srcstride, int height, \
+                                                    intptr_t mx, intptr_t my, int width)     \
+{                                                                                            \
+    ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,          \
+                                                   height, mx, my, width);                   \
+    ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
+                                                   height, mx, my, width);                   \
+}
+
+#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2)                              \
+mc_rep_mix_8(name, width1, width2, width3, opt1, opt2);                                      \
+mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2);                                   \
+mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
+
+#if HAVE_AVX2_EXTERNAL
+
+mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_hv,    48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_h ,    48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_v ,    48, 32, 16, avx2, sse4);
+
+mc_rep_mix_10(pel_pixels,    24, 16, 8, avx2, sse4, 32);
+mc_bi_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_hv,      24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_h ,      24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_v ,      24, 16, 8, avx2, sse4, 32);
+
+
+mc_rep_mixs_10(qpel_h ,      24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(qpel_v ,      24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(qpel_hv,      24, 16, 8, avx2, sse4, 32);
+
+
+mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2);//used for 10bit
+mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2); //used for 10bit
+
+mc_rep_funcs(pel_pixels, 8, 32, 64, avx2);
+
+mc_rep_func(pel_pixels, 10, 16, 32, avx2);
+mc_rep_func(pel_pixels, 10, 16, 48, avx2);
+mc_rep_func(pel_pixels, 10, 32, 64, avx2);
+
+mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2);
+mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2);
+mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2);
+
+mc_rep_funcs(epel_h, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_v, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_h, 10, 16, 32, avx2);
+mc_rep_funcs(epel_h, 10, 16, 48, avx2);
+mc_rep_funcs(epel_h, 10, 32, 64, avx2);
+
+mc_rep_funcs(epel_v, 10, 16, 32, avx2);
+mc_rep_funcs(epel_v, 10, 16, 48, avx2);
+mc_rep_funcs(epel_v, 10, 32, 64, avx2);
+
+
+mc_rep_funcs(epel_hv, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_hv, 10, 16, 32, avx2);
+mc_rep_funcs(epel_hv, 10, 16, 48, avx2);
+mc_rep_funcs(epel_hv, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_h, 8, 32, 64, avx2);
+mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4);
+
+mc_rep_funcs(qpel_v, 8, 32, 64, avx2);
+mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4);
+
+mc_rep_funcs(qpel_h, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_h, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_h, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_v, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_v, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_v, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_hv, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_hv, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_hv, 10, 32, 64, avx2);
+
+#endif //AVX2
+
 mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
 mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
 mc_rep_funcs(pel_pixels, 8, 16, 32, sse4);
@@ -218,7 +361,6 @@ mc_rep_funcs(epel_hv, 8, 8, 64, sse4);
 mc_rep_funcs(epel_hv, 8, 8, 48, sse4);
 mc_rep_funcs(epel_hv, 8, 8, 32, sse4);
 mc_rep_funcs(epel_hv, 8, 8, 24, sse4);
-mc_rep_funcs(epel_hv, 8, 8, 16, sse4);
 mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4);
 mc_rep_funcs(epel_hv,10, 8, 64, sse4);
 mc_rep_funcs(epel_hv,10, 8, 48, sse4);
@@ -500,7 +642,7 @@ SAO_BAND_FILTER_FUNCS(8, avx2);
 SAO_BAND_FILTER_FUNCS(10, avx2);
 SAO_BAND_FILTER_FUNCS(12, avx2);
 
-#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt )                           \
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt)                            \
    PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt );                      \
    PEL_LINK(pointer, 2, my , mx , fname##6 ,  bitd, opt );                      \
    PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt );                      \
@@ -589,6 +731,89 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
            if (ARCH_X86_64) {
                SAO_BAND_INIT(8, avx2);
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
            }
            c->transform_add[3] = ff_hevc_transform_add32_8_avx2;
@@ -648,6 +873,148 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
            if (ARCH_X86_64) {
                SAO_BAND_INIT(10, avx2);
+                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
            }
            c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
-- 
1.9.2.msysgit.0
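A note for readers less familiar with hevcdsp_init.c: the [w][v][h] assignments in the patch follow FFmpeg's usual function-pointer dispatch, where w is a block-width index (in the 10-bit block above, 5 = 16 pixels, 6 = 24, 7 = 32, 8 = 48, 9 = 64) and v/h select whether a vertical and/or horizontal filter is applied. The following is a minimal, hypothetical C sketch of that pattern, not the real FFmpeg definitions; the type names, mini_dsp_init, and mc_pixels_c are illustrative only, and the kernel signature mirrors the prototypes declared in hevcdsp.h above:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-ins for the real FFmpeg types (illustration only). */
typedef void (*mc_fn)(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
                      int height, intptr_t mx, intptr_t my, int width);

typedef struct MiniDSPContext {
    /* [width index][vertical filter?][horizontal filter?] */
    mc_fn put_hevc_qpel[10][2][2];
} MiniDSPContext;

#define MAX_PB_SIZE 64   /* intermediate-buffer stride, as in the asm above */

/* Reference "pel_pixels" kernel for 8-bit input: no filtering, just a copy
 * shifted left by 14 - bitdepth = 6, matching MC_PIXEL_COMPUTE. */
static void mc_pixels_c(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
                        int height, intptr_t mx, intptr_t my, int width)
{
    (void)mx; (void)my;                 /* unused for unfiltered copies */
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = src[x] << 6;
        dst += MAX_PB_SIZE;
        src += srcstride;
    }
}

static void mini_dsp_init(MiniDSPContext *c, int have_avx2)
{
    /* Fill every slot with a placeholder first... */
    for (int w = 0; w < 10; w++)
        for (int v = 0; v < 2; v++)
            for (int h = 0; h < 2; h++)
                c->put_hevc_qpel[w][v][h] = mc_pixels_c;
    if (have_avx2) {
        /* ...then overwrite the wide-block slots with AVX2 kernels,
         * exactly as the patch does, e.g.
         * c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2; */
    }
}

This also shows why the mc_rep_mix_* macros exist: a 48-wide block has no native AVX2 kernel, so the patch composes one 32-wide AVX2 call with one 16-wide SSE4 call at the appropriate offsets.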