Hi, Martin,

Thanks for your comments.

I have now amended the unreasonable uses of ldp/stp that I could find, and I have updated patch 3 and patch 5 (although I have attached all 5 patches). In addition, I previously thought that the calling convention required the whole of q8-q15 to be saved, but I have since confirmed that only the lower 64 bits (d8-d15) need to be preserved. Thank you for the reminder.

Please take a look. If there are only some small mistakes, please feel free to correct them directly. If there are still many problems, please let me know again. Thank you!


在 2023/7/2 5:28, Martin Storsjö 写道:
On Sun, 18 Jun 2023, Logan.Lyu wrote:

Hi, Martin,

I modified it according to your comments. Please review again.

From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sun, 28 May 2023 10:35:43 +0800
Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_hv

Signed-off-by: Logan Lyu <logan....@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 694 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
 2 files changed, 700 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 8b6f396a0b..355679af29 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -717,6 +717,700 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
         ret
 endfunc

+.macro epel_uni_w_hv_start
+        mov             x15, x5         //denom
+        mov             x16, x6         //wx
+        mov             x17, x7         //ox
+        add             w15, w15, #6    //shift = denom+6
+
+
+        ldp             x5, x6, [sp]
+        ldr             x7, [sp, #16]
+
+        stp             q12, q13, [sp, #-128]!
+        stp             q14, q15, [sp, #32]
+        stp             q8, q9,   [sp, #64]
+        stp             q10, q11, [sp, #96]

Only need to back up 64 bytes, by backing up d8-d15. Also, the order
is quite weird here, why not keep them in e.g. linear order?

+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!

As mentioned already in the previous review - why do you back up and
restore xzr here? That's not necessary. Yes, you should keep the stack
16 byte aligned, but you can just leave an empty slot, and just do
"str x30, [sp, #-48]!" here, and vice versa with "ldr" instead of ldp
when restoring.

The same goes in all functions here.

+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128

Only need d8-d15, and weird register order here, and elsewhere.

+function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4

FWIW, it's unusual to need an explicit sxtw instruction, but I guess
if you use it in the form "add x10, x4, #3" it might be needed.

+function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!

Don't do consecutive stack pointer updates like this, but merge it
into one large stack decrement followed by positive offsets, like in
all the other cases of stp/ldp.

+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!

Don't do multiple stack decrements, don't needlessly store xzr here.

The same goes for all the other functions in this patch.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
From c7959c64da41d2e6a14cbd3afa019fa1792d9767 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sat, 27 May 2023 09:42:07 +0800
Subject: [PATCH v1 3/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_v

---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 503 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
 2 files changed, 509 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 0411de9864..0e3bf74953 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -375,3 +375,506 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, 
export=1
 endfunc
 
 #endif
+
+
+.macro EPEL_UNI_W_V_HEADER
+        ldr             x12, [sp, #8]           // x12 = 2nd stack argument (my in the epel_uni_w_v prototype)
+        movrel          x9, epel_filters
+        add             x9, x9, x12, lsl #2     // x9 = epel_filters + 4 * x12 (one 4-byte entry per index)
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [x9] // filter: tap i replicated into every byte of v<i>
+        neg             v0.16b, v0.16b          // taps 0 and 3 are negated here; the *_CALC macros apply them with umlsl
+        neg             v3.16b, v3.16b
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(w5 + 6), w5 = denom; negative count makes sqrshl shift right
+        dup             v30.8h, w6              // v30 = w6 (wx) in every 16-bit lane
+        dup             v31.4s, w10             // v31 = shift count in every 32-bit lane
+        dup             v29.4s, w7              // v29 = w7 (ox) in every 32-bit lane
+        sub             x2, x2, x3              // x2 -= x3: start one source row above (src - srcstride)
+.endm
+
+.macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3
+        movi            \d0\().2d, #0           // clear accumulator; s0-s3 = four consecutive rows, v0-v3 = taps
+        umlsl           \d0\().8h, \s0\().8b, v0.8b     // acc -= s0 * v0
+        umlal           \d0\().8h, \s1\().8b, v1.8b     // acc += s1 * v1
+        umlal           \d0\().8h, \s2\().8b, v2.8b     // acc += s2 * v2
+        umlsl           \d0\().8h, \s3\().8b, v3.8b     // acc -= s3 * v3
+        smull           \d0\().4s, \d0\().4h, v30.4h    // low 4 lanes widened to 32 bit and multiplied by v30
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s    // saturating rounding shift by the per-lane count in v31
+        sqadd           \d0\().4s, \d0\().4s, v29.4s    // saturating add of v29
+        sqxtn           \d0\().4h, \d0\().4s            // saturating narrow to signed 16 bit
+        sqxtun          \d0\().8b, \d0\().8h            // saturating narrow to unsigned 8 bit; result in low 4 bytes
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_v4_8_neon, export=1
+        EPEL_UNI_W_V_HEADER                     // 4-byte-wide column filter: load taps/weights, x2 -= x3
+
+        ldr             s4, [x2]                // prime rows 0-2 (4 bytes each) into v4-v6
+        ldr             s5, [x2, x3]
+        add             x2, x2, x3, lsl #1      // x2 += 2 * x3
+        ldr             s6, [x2]
+1:      // loop body unrolled 4x; the row registers v4-v7 rotate one position per output row
+        ldr             s7, [x2, x3]            // newest row
+        subs            w4, w4, #1              // w4 counts remaining output rows; flags feed the b.eq below
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V4_CALC v16, v4, v5, v6, v7
+        str             s16, [x0]               // store 4 output bytes
+        b.eq            2f                      // done when the row counter reached zero
+        add             x0, x0, x1              // next destination row
+        ldr             s4, [x2]                // v4 is reused for the newest row
+        subs            w4, w4, #1
+        EPEL_UNI_W_V4_CALC v17, v5, v6, v7, v4
+        str             s17, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             s5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V4_CALC v18, v6, v7, v4, v5
+        str             s18, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             s6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V4_CALC v19, v7, v4, v5, v6
+        str             s19, [x0]
+        add             x0, x0, x1
+        b.hi            1b                      // loop again while the counter is still above zero
+2:
+        ret
+endfunc
+
+.macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1
+        movi            \d0\().2d, #0           // clear accumulator; s0-s3 = four consecutive rows, t0/t1 = scratch
+        umlsl           \d0\().8h, \s0\().8b, v0.8b     // acc -= s0 * v0
+        umlal           \d0\().8h, \s1\().8b, v1.8b     // acc += s1 * v1
+        umlal           \d0\().8h, \s2\().8b, v2.8b     // acc += s2 * v2
+        umlsl           \d0\().8h, \s3\().8b, v3.8b     // acc -= s3 * v3
+        smull           \t0\().4s, \d0\().4h, v30.4h    // lanes 0-3 widened to 32 bit and multiplied by v30
+        smull2          \t1\().4s, \d0\().8h, v30.8h    // lanes 4-7 likewise
+        sqrshl          \t0\().4s, \t0\().4s, v31.4s    // saturating rounding shift by the per-lane count in v31
+        sqrshl          \t1\().4s, \t1\().4s, v31.4s
+        sqadd           \t0\().4s, \t0\().4s, v29.4s    // saturating add of v29
+        sqadd           \t1\().4s, \t1\().4s, v29.4s
+        sqxtn           \d0\().4h, \t0\().4s            // saturating narrow both halves back to 16 bit
+        sqxtn2          \d0\().8h, \t1\().4s
+        sqxtun          \d0\().8b, \d0\().8h            // saturating narrow to unsigned 8 bit; result in low 8 bytes
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_v6_8_neon, export=1
+        EPEL_UNI_W_V_HEADER                     // 6-byte-wide column filter: load taps/weights, x2 -= x3
+
+        sub             x1, x1, #4              // compensate for the 4-byte post-increment of each str below
+        ldr             d4, [x2]                // prime rows 0-2 (8 bytes loaded per row) into v4-v6
+        ldr             d5, [x2, x3]
+        add             x2, x2, x3, lsl #1      // x2 += 2 * x3
+        ldr             d6, [x2]
+1:      // loop body unrolled 4x; the row registers v4-v7 rotate one position per output row
+        ldr             d7, [x2, x3]            // newest row
+        subs            w4, w4, #1              // w4 counts remaining output rows; flags feed the b.eq below
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v16, v4, v5, v6, v7, v20, v21
+        str             s16, [x0], #4           // output bytes 0-3, x0 += 4
+        st1             {v16.h}[2], [x0], x1    // output bytes 4-5, then x0 += x1 (next destination row)
+        b.eq            2f                      // done when the row counter reached zero
+        ldr             d4, [x2]                // v4 is reused for the newest row
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v17, v5, v6, v7, v4, v20, v21
+        str             s17, [x0], #4
+        st1             {v17.h}[2], [x0], x1
+        b.eq            2f
+        ldr             d5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v18, v6, v7, v4, v5, v20, v21
+        str             s18, [x0], #4
+        st1             {v18.h}[2], [x0], x1
+        b.eq            2f
+        ldr             d6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v19, v7, v4, v5, v6, v20, v21
+        str             s19, [x0], #4
+        st1             {v19.h}[2], [x0], x1
+        b.hi            1b                      // loop again while the counter is still above zero
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v8_8_neon, export=1
+        EPEL_UNI_W_V_HEADER                     // 8-byte-wide column filter: load taps/weights, x2 -= x3
+
+        ldr             d4, [x2]                // prime rows 0-2 (8 bytes each) into v4-v6
+        ldr             d5, [x2, x3]
+        add             x2, x2, x3, lsl #1      // x2 += 2 * x3
+        ldr             d6, [x2]
+1:      // loop body unrolled 4x; the row registers v4-v7 rotate one position per output row
+        ldr             d7, [x2, x3]            // newest row
+        subs            w4, w4, #1              // w4 counts remaining output rows; flags feed the b.eq below
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v16, v4, v5, v6, v7, v20, v21
+        str             d16, [x0]               // store 8 output bytes
+        add             x0, x0, x1              // next destination row
+        b.eq            2f                      // done when the row counter reached zero
+        ldr             d4, [x2]                // v4 is reused for the newest row
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v17, v5, v6, v7, v4, v20, v21
+        str             d17, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             d5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v18, v6, v7, v4, v5, v20, v21
+        str             d18, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             d6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v19, v7, v4, v5, v6, v20, v21
+        str             d19, [x0]
+        add             x0, x0, x1
+        b.hi            1b                      // loop again while the counter is still above zero
+2:
+        ret
+endfunc
+
+.macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
+        movi            \d0\().2d, #0           // d0 accumulates lanes 0-7, d1 (clobbered) lanes 8-15; t3 is not used
+        movi            \d1\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b     // acc -= s0 * v0 (low half)
+        umlsl2          \d1\().8h, \s0\().16b, v0.16b   // same for the high half
+        umlal           \d0\().8h, \s1\().8b, v1.8b     // acc += s1 * v1
+        umlal2          \d1\().8h, \s1\().16b, v1.16b
+        umlal           \d0\().8h, \s2\().8b, v2.8b     // acc += s2 * v2
+        umlal2          \d1\().8h, \s2\().16b, v2.16b
+        umlsl           \d0\().8h, \s3\().8b, v3.8b     // acc -= s3 * v3
+        umlsl2          \d1\().8h, \s3\().16b, v3.16b
+
+        smull           \t0\().4s, \d0\().4h, v30.4h    // lanes 0-3 widened to 32 bit and multiplied by v30
+        smull2          \t1\().4s, \d0\().8h, v30.8h    // lanes 4-7
+        smull           \t2\().4s, \d1\().4h, v30.4h    // lanes 8-11; lanes 12-15 are not weighted
+
+        sqrshl          \t0\().4s, \t0\().4s, v31.4s    // saturating rounding shift by the per-lane count in v31
+        sqrshl          \t1\().4s, \t1\().4s, v31.4s
+        sqrshl          \t2\().4s, \t2\().4s, v31.4s
+        sqadd           \t0\().4s, \t0\().4s, v29.4s    // saturating add of v29
+        sqadd           \t1\().4s, \t1\().4s, v29.4s
+        sqadd           \t2\().4s, \t2\().4s, v29.4s
+
+        sqxtn           \d0\().4h, \t0\().4s            // saturating narrow back to 16 bit
+        sqxtn2          \d0\().8h, \t1\().4s
+        sqxtn           \d1\().4h, \t2\().4s
+        sqxtun          \d0\().8b,  \d0\().8h           // saturating narrow to unsigned 8 bit: bytes 0-7
+        sqxtun2         \d0\().16b, \d1\().8h           // bytes 8-15 of d0 (callers in this file store only bytes 0-11)
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_v12_8_neon, export=1
+        EPEL_UNI_W_V_HEADER                     // 12-byte-wide column filter: load taps/weights, x2 -= x3
+
+        ldr             q4, [x2]                // prime rows 0-2 (16 bytes loaded per row) into v4-v6
+        ldr             q5, [x2, x3]
+        add             x2, x2, x3, lsl #1      // x2 += 2 * x3
+        ldr             q6, [x2]
+        sub             x1, x1, #8              // compensate for the 8-byte post-increment of each str below
+1:      // loop body unrolled 4x; the row registers v4-v7 rotate one position per output row
+        ldr             q7, [x2, x3]            // newest row
+        subs            w4, w4, #1              // w4 counts remaining output rows; flags feed the b.eq below
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V12_CALC v16, v17, v4, v5, v6, v7, v24, v25, v26, v27
+        str             d16, [x0], #8           // output bytes 0-7, x0 += 8
+        st1             {v16.s}[2], [x0]        // output bytes 8-11
+        add             x0, x0, x1              // next destination row
+        b.eq            2f                      // done when the row counter reached zero
+        ldr             q4, [x2]                // v4 is reused for the newest row
+        subs            w4, w4, #1
+        EPEL_UNI_W_V12_CALC v18, v19, v5, v6, v7, v4, v24, v25, v26, v27
+        str             d18, [x0], #8
+        st1             {v18.s}[2], [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V12_CALC v20, v21, v6, v7, v4, v5, v24, v25, v26, v27
+        str             d20, [x0], #8
+        st1             {v20.s}[2], [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V12_CALC v22, v23, v7, v4, v5, v6, v24, v25, v26, v27
+        str             d22, [x0], #8
+        st1             {v22.s}[2], [x0]
+        add             x0, x0, x1
+        b.hi            1b                      // loop again while the counter is still above zero
+2:
+        ret
+endfunc
+
+.macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
+        movi            \d0\().2d, #0           // d0 accumulates lanes 0-7, d1 (clobbered) lanes 8-15; t0-t3 = scratch
+        movi            \d1\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b     // acc -= s0 * v0 (low half)
+        umlsl2          \d1\().8h, \s0\().16b, v0.16b   // same for the high half
+        umlal           \d0\().8h, \s1\().8b, v1.8b     // acc += s1 * v1
+        umlal2          \d1\().8h, \s1\().16b, v1.16b
+        umlal           \d0\().8h, \s2\().8b, v2.8b     // acc += s2 * v2
+        umlal2          \d1\().8h, \s2\().16b, v2.16b
+        umlsl           \d0\().8h, \s3\().8b, v3.8b     // acc -= s3 * v3
+        umlsl2          \d1\().8h, \s3\().16b, v3.16b
+
+        smull           \t0\().4s, \d0\().4h, v30.4h    // lanes 0-3 widened to 32 bit and multiplied by v30
+        smull2          \t1\().4s, \d0\().8h, v30.8h    // lanes 4-7
+        smull           \t2\().4s, \d1\().4h, v30.4h    // lanes 8-11
+        smull2          \t3\().4s, \d1\().8h, v30.8h    // lanes 12-15
+
+        sqrshl          \t0\().4s, \t0\().4s, v31.4s    // saturating rounding shift by the per-lane count in v31
+        sqrshl          \t1\().4s, \t1\().4s, v31.4s
+        sqrshl          \t2\().4s, \t2\().4s, v31.4s
+        sqrshl          \t3\().4s, \t3\().4s, v31.4s
+        sqadd           \t0\().4s, \t0\().4s, v29.4s    // saturating add of v29
+        sqadd           \t1\().4s, \t1\().4s, v29.4s
+        sqadd           \t2\().4s, \t2\().4s, v29.4s
+        sqadd           \t3\().4s, \t3\().4s, v29.4s
+
+        sqxtn           \d0\().4h, \t0\().4s            // saturating narrow back to 16 bit
+        sqxtn2          \d0\().8h, \t1\().4s
+        sqxtn           \d1\().4h, \t2\().4s
+        sqxtn2          \d1\().8h, \t3\().4s
+        sqxtun          \d0\().8b,  \d0\().8h           // saturating narrow to unsigned 8 bit: bytes 0-7
+        sqxtun2         \d0\().16b, \d1\().8h           // bytes 8-15; the full 16-byte result is in d0
+.endm
+
+
+function ff_hevc_put_hevc_epel_uni_w_v16_8_neon, export=1
+        EPEL_UNI_W_V_HEADER                     // 16-byte-wide column filter: load taps/weights, x2 -= x3
+
+        ldr             q4, [x2]                // prime rows 0-2 (16 bytes each) into v4-v6
+        ldr             q5, [x2, x3]
+        add             x2, x2, x3, lsl #1      // x2 += 2 * x3
+        ldr             q6, [x2]
+1:      // loop body unrolled 4x; the row registers v4-v7 rotate one position per output row
+        ldr             q7, [x2, x3]            // newest row
+        subs            w4, w4, #1              // w4 counts remaining output rows; flags feed the b.eq below
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V16_CALC v16, v17, v4, v5, v6, v7, v24, v25, v26, v27
+        str             q16, [x0]               // store 16 output bytes
+        add             x0, x0, x1              // next destination row
+        b.eq            2f                      // done when the row counter reached zero
+        ldr             q4, [x2]                // v4 is reused for the newest row
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v18, v19, v5, v6, v7, v4, v24, v25, v26, v27
+        str             q18, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V16_CALC v20, v21, v6, v7, v4, v5, v24, v25, v26, v27
+        str             q20, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v22, v23, v7, v4, v5, v6, v24, v25, v26, v27
+        str             q22, [x0]
+        add             x0, x0, x1
+        b.hi            1b                      // loop again while the counter is still above zero
+2:
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_epel_uni_w_v24_8_neon, export=1
+        EPEL_UNI_W_V_HEADER                     // 24-byte-wide column filter: load taps/weights, x2 -= x3
+
+        ldp             q16, q17, [x2]          // prime rows 0-2; each row is loaded as 32 bytes into a register pair
+        add             x2, x2, x3
+        ldp             q18, q19, [x2]
+        add             x2, x2, x3
+        ldp             q20, q21, [x2]
+        add             x2, x2, x3
+1:      // loop body unrolled 4x; the row pairs (v16/17, v18/19, v20/21, v22/23) rotate per output row
+        ldp             q22, q23, [x2]          // newest row
+        subs            w4, w4, #1              // w4 counts remaining output rows; flags feed the b.eq below
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v16, v18, v20, v22, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v17, v19, v21, v23, v24, v25
+        str             q4, [x0]                // output bytes 0-15
+        str             d6, [x0, #16]           // output bytes 16-23
+        add             x0, x0, x1              // next destination row
+        b.eq            2f                      // done when the row counter reached zero
+        ldp             q16, q17, [x2]          // v16/v17 are reused for the newest row
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v18, v20, v22, v16, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v19, v21, v23, v17, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q18, q19, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v20, v22, v16, v18,  v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v21, v23, v17, v19, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q20, q21, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v22, v16, v18, v20, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v23, v17, v19, v21, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.hi            1b                      // loop again while the counter is still above zero
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v32_8_neon, export=1
+        EPEL_UNI_W_V_HEADER                     // 32-byte-wide column filter: load taps/weights, x2 -= x3
+
+        ldp             q16, q17, [x2]          // prime rows 0-2; each row is 32 bytes in a register pair
+        add             x2, x2, x3
+        ldp             q18, q19, [x2]
+        add             x2, x2, x3
+        ldp             q20, q21, [x2]
+        add             x2, x2, x3
+1:      // loop body unrolled 4x; the row pairs (v16/17, v18/19, v20/21, v22/23) rotate per output row
+        ldp             q22, q23, [x2]          // newest row
+        subs            w4, w4, #1              // w4 counts remaining output rows; flags feed the b.eq below
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v16, v18, v20, v22, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v17, v19, v21, v23, v24, v25, v26, v27
+        str             q4, [x0]                // output bytes 0-15
+        str             q6, [x0, #16]           // output bytes 16-31
+        add             x0, x0, x1              // next destination row
+        b.eq            2f                      // done when the row counter reached zero
+        ldp             q16, q17, [x2]          // v16/v17 are reused for the newest row
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v18, v20, v22, v16, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v19, v21, v23, v17, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q18, q19, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v20, v22, v16, v18,  v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v21, v23, v17, v19, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q20, q21, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v22, v16, v18, v20, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v23, v17, v19, v21, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.hi            1b                      // loop again while the counter is still above zero
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v48_8_neon, export=1
+        EPEL_UNI_W_V_HEADER                     // 48-byte-wide column filter; the header reads [sp, #8] before sp moves
+        stp             d8, d9, [sp, #-32]!     // v8-v11 serve as scratch below: save d8-d11 in a 32-byte frame
+        stp             d10, d11, [sp, #16]
+
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3   // prime rows 0-2, 48 bytes each, x2 += x3
+        ld1             {v19.16b, v20.16b, v21.16b}, [x2], x3
+        ld1             {v22.16b, v23.16b, v24.16b}, [x2], x3
+1:      // loop body unrolled 4x; the row triples v16-18, v19-21, v22-24, v25-27 rotate per output row
+        ld1             {v25.16b, v26.16b, v27.16b}, [x2], x3   // newest row
+        subs            w4, w4, #1              // w4 counts remaining output rows; flags feed the b.eq below
+        EPEL_UNI_W_V16_CALC v4, v6, v16, v19, v22, v25, v8, v9, v10, v11 // result in v4; 2nd operand is scratch
+        EPEL_UNI_W_V16_CALC v5, v7, v17, v20, v23, v26, v8, v9, v10, v11 // result in v5
+        EPEL_UNI_W_V16_CALC v6, v7, v18, v21, v24, v27, v8, v9, v10, v11 // result in v6
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1      // store 48 bytes, x0 += x1
+        b.eq            2f                      // done when the row counter reached zero
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v19, v22, v25, v16, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v20, v23, v26, v17, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v21, v24, v27, v18, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v19.16b, v20.16b, v21.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6,  v22, v25, v16, v19, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7,  v23, v26, v17, v20, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7,  v24, v27, v18, v21, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v22.16b, v23.16b, v24.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6,  v25, v16, v19, v22, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7,  v26, v17, v20, v23, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7,  v27, v18, v21, v24, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.hi            1b                      // loop again while the counter is still above zero
+2:
+        ldp             d10, d11, [sp, #16]     // restore d8-d11 and release the 32-byte frame
+        ldp             d8, d9, [sp], #32
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v64_8_neon, export=1
+        EPEL_UNI_W_V_HEADER                     // 64-byte-wide column filter; the header reads [sp, #8] before sp moves
+        stp             d8, d9, [sp, #-64]!     // v8-v15 are overwritten below: save d8-d15 in a 64-byte frame
+        stp             d10, d11, [sp, #16]
+        stp             d12, d13, [sp, #32]
+        stp             d14, d15, [sp, #48]
+
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3  // prime rows 0-2, 64 bytes each, x2 += x3
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+1:      // loop body unrolled 4x; the row quads v16-19, v20-23, v24-27, v12-15 rotate per output row
+        ld1             {v12.16b, v13.16b, v14.16b, v15.16b}, [x2], x3  // newest row
+        subs            w4, w4, #1              // w4 counts remaining output rows; flags feed the b.eq below
+        EPEL_UNI_W_V16_CALC v4, v6, v16, v20, v24, v12, v8, v9, v10, v11 // result in v4; 2nd operand is scratch
+        EPEL_UNI_W_V16_CALC v5, v7, v17, v21, v25, v13, v8, v9, v10, v11 // result in v5
+        EPEL_UNI_W_V16_CALC v6, v7, v18, v22, v26, v14, v8, v9, v10, v11 // result in v6
+        EPEL_UNI_W_V16_CALC v7,v28, v19, v23, v27, v15, v8, v9, v10, v11 // result in v7, v28 is scratch
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1      // store 64 bytes, x0 += x1
+        b.eq            2f                      // done when the row counter reached zero
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v20, v24, v12, v16, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v21, v25, v13, v17, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v22, v26, v14, v18, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7,v28, v23, v27, v15, v19, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v24, v12, v16, v20, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v25, v13, v17, v21, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v26, v14, v18, v22, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7,v28, v27, v15, v19, v23, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v12, v16, v20, v24, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v13, v17, v21, v25, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v14, v18, v22, v26, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7,v28, v15, v19, v23, v27, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.hi            1b                      // loop again while the counter is still above zero
+2:
+        ldp             d10, d11, [sp, #16]     // restore d8-d15 and release the 64-byte frame
+        ldp             d12, d13, [sp, #32]
+        ldp             d14, d15, [sp, #48]
+        ldp             d8, d9, [sp], #64
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 8af0a2b4b9..4a260e1d9a 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -161,6 +161,11 @@ NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t 
_dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -274,6 +279,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
-- 
2.38.0.windows.1

From f07eee2c6cdeb0260c00a1ec49a0dddb6b9df9db Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sun, 28 May 2023 10:30:28 +0800
Subject: [PATCH v1 4/5] lavc/aarch64: new optimization for 8-bit hevc_epel_h

---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 343 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 348 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 0e3bf74953..8942a41cbf 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -33,6 +33,349 @@ const epel_filters, align=4
 endconst
 
 #if HAVE_I8MM
+
+.macro EPEL_H_HEADER
+        movrel          x5, epel_filters
+        add             x5, x5, x4, lsl #2  // x5 = epel_filters + 4 * x4 (x4 = mx in the epel_h prototype)
+        ld1r            {v30.4s}, [x5]  // replicate the 4 filter bytes into every 32-bit lane, as usdot expects
+        sub             x1, x1, #1  // start one byte to the left of the block
+        mov             x10, #(MAX_PB_SIZE * 2)  // byte step applied to the int16_t destination per row
+.endm
+
+function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.8b}, [x1], x2  // per row: 8 source bytes in, 4 filtered int16 values out
+        subs            w3, w3, #1   // height
+        ext             v5.8b, v4.8b, v4.8b, #1  // row shifted by 1 byte
+        ext             v6.8b, v4.8b, v4.8b, #2  // row shifted by 2 bytes
+        ext             v7.8b, v4.8b, v4.8b, #3  // row shifted by 3 bytes
+        trn1            v4.2s, v4.2s, v5.2s  // bytes 0-3 | bytes 1-4
+        trn1            v6.2s, v6.2s, v7.2s  // bytes 2-5 | bytes 3-6
+        trn1            v4.2d, v4.2d, v6.2d  // one 4-byte window per 32-bit lane, for outputs 0..3
+        movi            v16.2d, #0  // usdot accumulates into its destination, so clear it first
+        usdot           v16.4s, v4.16b, v30.16b  // per lane: dot product of the 4 window bytes with the 4 taps
+        xtn             v16.4h, v16.4s  // keep the low 16 bits of each sum
+        st1             {v16.4h}, [x0], x10
+        b.ne            1b  // flags still come from the subs above
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b},  [x1], x2  // per row: 16 source bytes in, 6 filtered int16 values out
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v4.16b, v4.16b, #1  // full 16-byte ext so that lane s1 really holds bytes 5-8
+        ext             v6.8b, v4.8b, v4.8b, #2  // only lane s0 (bytes 2-5) is used below
+        ext             v7.8b, v4.8b, v4.8b, #3  // only lane s0 (bytes 3-6) is used below
+        trn1            v16.2s, v4.2s, v5.2s  // windows for outputs 0 and 1
+        trn2            v17.2s, v4.2s, v5.2s  // windows for outputs 4 and 5
+        trn1            v6.2s, v6.2s, v7.2s  // windows for outputs 2 and 3
+        trn1            v16.2d, v16.2d, v6.2d  // v16 = windows for outputs 0..3
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v18.4s, v16.16b, v30.16b
+        usdot           v19.2s, v17.8b, v30.8b
+        xtn             v18.4h, v18.4s
+        xtn             v19.4h, v19.4s
+        str             d18, [x0]  // outputs 0..3
+        str             s19, [x0, #8]  // outputs 4 and 5
+        add             x0, x0, x10
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2  // per row: 16 source bytes in, 8 filtered int16 values out
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.16b, v4.16b, v4.16b, #2
+        ext             v7.16b, v4.16b, v4.16b, #3
+        zip1            v20.4s, v4.4s, v6.4s  // windows for outputs 0, 2, 4, 6
+        zip1            v21.4s, v5.4s, v7.4s  // windows for outputs 1, 3, 5, 7
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b
+        usdot           v17.4s, v21.16b, v30.16b
+        xtn             v16.4h, v16.4s  // even outputs
+        xtn             v17.4h, v17.4s  // odd outputs
+        st2             {v16.4h, v17.4h}, [x0], x10  // st2 interleaves even/odd back into 0..7
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2  // per row: 16 source bytes in, 12 filtered int16 values out
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.16b, v4.16b, v4.16b, #2
+        ext             v7.16b, v4.16b, v4.16b, #3
+        trn1            v20.2d, v4.2d, v6.2d  // low halves of shift 0 and shift 2
+        trn2            v22.2d, v4.2d, v6.2d  // high halves of shift 0 and shift 2
+        trn1            v21.2d, v5.2d, v7.2d  // low halves of shift 1 and shift 3
+        trn2            v23.2d, v5.2d, v7.2d  // high halves of shift 1 and shift 3
+        trn1            v4.4s, v20.4s, v21.4s  // windows for outputs 0..3
+        trn2            v5.4s, v20.4s, v21.4s  // windows for outputs 4..7
+        trn1            v6.4s, v22.4s, v23.4s  // windows for outputs 8..11
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v4.16b, v30.16b
+        usdot           v17.4s, v5.16b, v30.16b
+        usdot           v18.4s, v6.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v17.4s  // v16 = outputs 0..7
+        xtn             v18.4h, v18.4s  // v18 low half = outputs 8..11
+        str             q16, [x0]
+        str             d18, [x0, #16]
+        add             x0, x0, x10
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2  // per row: 32 source bytes in, 16 filtered int16 values out
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        zip1            v20.4s, v0.4s, v6.4s  // windows for outputs 0, 2, 4, 6
+        zip2            v22.4s, v0.4s, v6.4s  // windows for outputs 8, 10, 12, 14
+        zip1            v21.4s, v5.4s, v7.4s  // windows for outputs 1, 3, 5, 7
+        zip2            v23.4s, v5.4s, v7.4s  // windows for outputs 9, 11, 13, 15
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b
+        usdot           v17.4s, v21.16b, v30.16b
+        usdot           v18.4s, v22.16b, v30.16b
+        usdot           v19.4s, v23.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v18.4s  // v16 = even outputs 0, 2, ..., 14
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v19.4s  // v17 = odd outputs 1, 3, ..., 15
+        st2             {v16.8h, v17.8h}, [x0], x10  // st2 interleaves even/odd back into 0..15
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2  // per row: 32 source bytes in, 24 filtered int16 values out
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        ext             v26.16b, v1.16b, v1.16b, #1  // wraps around at the top; only the two low lanes feed stored outputs
+        ext             v27.16b, v1.16b, v1.16b, #2
+        ext             v28.16b, v1.16b, v1.16b, #3
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b  // outputs 0, 4, 8, 12
+        usdot           v17.4s, v5.16b, v30.16b  // outputs 1, 5, 9, 13
+        usdot           v18.4s, v6.16b, v30.16b  // outputs 2, 6, 10, 14
+        usdot           v19.4s, v7.16b, v30.16b  // outputs 3, 7, 11, 15
+        usdot           v20.4s, v1.16b, v30.16b  // outputs 16, 20 (upper lanes unused)
+        usdot           v21.4s, v26.16b, v30.16b  // outputs 17, 21
+        usdot           v22.4s, v27.16b, v30.16b  // outputs 18, 22
+        usdot           v23.4s, v28.16b, v30.16b  // outputs 19, 23
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v20.4s
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v21.4s
+        xtn             v18.4h, v18.4s
+        xtn2            v18.8h, v22.4s
+        xtn             v19.4h, v19.4s
+        xtn2            v19.8h, v23.4s
+        zip1            v20.8h, v16.8h, v18.8h  // even outputs 0, 2, ..., 14
+        zip1            v21.8h, v17.8h, v19.8h  // odd outputs 1, 3, ..., 15
+        zip2            v22.8h, v16.8h, v18.8h  // even outputs from 16 upwards
+        zip2            v23.8h, v17.8h, v19.8h  // odd outputs from 17 upwards
+        zip1            v22.8h, v22.8h, v23.8h  // outputs 16..23 in order
+        add             x7, x0, #32  // address of output 16, taken before x0 is advanced
+        st2             {v20.8h, v21.8h}, [x0], x10  // outputs 0..15
+        st1             {v22.8h}, [x7]  // outputs 16..23
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b}, [x1], x2  // per row: 48 source bytes in, 32 filtered int16 values out
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        ext             v26.16b, v1.16b, v2.16b, #1
+        ext             v27.16b, v1.16b, v2.16b, #2
+        ext             v28.16b, v1.16b, v2.16b, #3
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b  // outputs 0, 4, 8, 12
+        usdot           v17.4s, v5.16b, v30.16b  // outputs 1, 5, 9, 13
+        usdot           v18.4s, v6.16b, v30.16b  // outputs 2, 6, 10, 14
+        usdot           v19.4s, v7.16b, v30.16b  // outputs 3, 7, 11, 15
+        usdot           v20.4s, v1.16b, v30.16b  // outputs 16, 20, 24, 28
+        usdot           v21.4s, v26.16b, v30.16b  // outputs 17, 21, 25, 29
+        usdot           v22.4s, v27.16b, v30.16b  // outputs 18, 22, 26, 30
+        usdot           v23.4s, v28.16b, v30.16b  // outputs 19, 23, 27, 31
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v20.4s  // v16 = outputs 0, 4, ..., 28
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v21.4s  // v17 = outputs 1, 5, ..., 29
+        xtn             v18.4h, v18.4s
+        xtn2            v18.8h, v22.4s  // v18 = outputs 2, 6, ..., 30
+        xtn             v19.4h, v19.4s
+        xtn2            v19.8h, v23.4s  // v19 = outputs 3, 7, ..., 31
+        st4             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10  // st4 interleaves the four phases back into 0..31
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2  // per row: 64 source bytes in, 48 filtered int16 values out
+        subs            w3, w3, #1   // height
+        ext             v4.16b, v0.16b, v1.16b, #1
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b  // outputs 0, 4, 8, 12
+        usdot           v21.4s, v4.16b, v30.16b  // outputs 1, 5, 9, 13
+        usdot           v22.4s, v5.16b, v30.16b  // outputs 2, 6, 10, 14
+        usdot           v23.4s, v6.16b, v30.16b  // outputs 3, 7, 11, 15
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b  // outputs 16, 20, 24, 28
+        usdot           v25.4s, v16.16b, v30.16b  // outputs 17, 21, 25, 29
+        usdot           v26.4s, v17.16b, v30.16b  // outputs 18, 22, 26, 30
+        usdot           v27.4s, v18.16b, v30.16b  // outputs 19, 23, 27, 31
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        add             x7, x0, #64  // address of output 32 in THIS row; must be taken before x0 is advanced
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x10  // outputs 0..31, then x0 moves to the next row
+        ext             v4.16b, v2.16b, v3.16b, #1
+        ext             v5.16b, v2.16b, v3.16b, #2
+        ext             v6.16b, v2.16b, v3.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v2.16b, v30.16b  // outputs 32, 36, 40, 44
+        usdot           v21.4s, v4.16b, v30.16b  // outputs 33, 37, 41, 45
+        usdot           v22.4s, v5.16b, v30.16b  // outputs 34, 38, 42, 46
+        usdot           v23.4s, v6.16b, v30.16b  // outputs 35, 39, 43, 47
+        xtn             v20.4h, v20.4s  // keep the four phases in separate registers ...
+        xtn             v21.4h, v21.4s
+        xtn             v22.4h, v22.4s
+        xtn             v23.4h, v23.4s
+        st4             {v20.4h, v21.4h, v22.4h, v23.4h}, [x7]  // ... so st4 interleaves them into 32, 33, ..., 47 in order
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+        sub             x2, x2, #64  // the first load of each row already advances x1 by 64
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64  // per row: 64 + 8 source bytes in, 64 filtered int16 values out
+        subs            w3, w3, #1   // height
+        ext             v4.16b, v0.16b, v1.16b, #1
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b  // outputs 0, 4, 8, 12
+        usdot           v21.4s, v4.16b, v30.16b  // outputs 1, 5, 9, 13
+        usdot           v22.4s, v5.16b, v30.16b  // outputs 2, 6, 10, 14
+        usdot           v23.4s, v6.16b, v30.16b  // outputs 3, 7, 11, 15
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b  // outputs 16, 20, 24, 28
+        usdot           v25.4s, v16.16b, v30.16b  // outputs 17, 21, 25, 29
+        usdot           v26.4s, v17.16b, v30.16b  // outputs 18, 22, 26, 30
+        usdot           v27.4s, v18.16b, v30.16b  // outputs 19, 23, 27, 31
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64  // outputs 0..31; x0 now at output 32
+        ld1             {v7.8b}, [x1], x2  // 8 bytes following the first 64; x1 ends up at the next row
+        ext             v4.16b, v2.16b, v3.16b, #1
+        ext             v5.16b, v2.16b, v3.16b, #2
+        ext             v6.16b, v2.16b, v3.16b, #3
+        ext             v16.16b, v3.16b, v7.16b, #1
+        ext             v17.16b, v3.16b, v7.16b, #2
+        ext             v18.16b, v3.16b, v7.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v2.16b, v30.16b  // outputs 32, 36, 40, 44
+        usdot           v21.4s, v4.16b, v30.16b  // outputs 33, 37, 41, 45
+        usdot           v22.4s, v5.16b, v30.16b  // outputs 34, 38, 42, 46
+        usdot           v23.4s, v6.16b, v30.16b  // outputs 35, 39, 43, 47
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v3.16b, v30.16b  // outputs 48, 52, 56, 60
+        usdot           v25.4s, v16.16b, v30.16b  // outputs 49, 53, 57, 61
+        usdot           v26.4s, v17.16b, v30.16b  // outputs 50, 54, 58, 62
+        usdot           v27.4s, v18.16b, v30.16b  // outputs 51, 55, 59, 63
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64  // outputs 32..63; x0 has advanced 128 bytes in total
+        b.ne            1b
+        ret
+endfunc
+
 .macro EPEL_UNI_W_H_HEADER
         ldr             x12, [sp]
         sub             x2, x2, #1
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4a260e1d9a..b448d755b9 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -171,6 +171,10 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -283,6 +287,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
+            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
-- 
2.38.0.windows.1

From 7c86c8aef2b718bf8a163614764943aa2a62df0c Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sun, 28 May 2023 10:35:43 +0800
Subject: [PATCH v1 5/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_hv

---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 668 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
 2 files changed, 674 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 8942a41cbf..93fb69cc24 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -717,6 +717,674 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
         ret
 endfunc
 
+.macro epel_uni_w_hv_start
+        mov             x15, x5         //denom
+        mov             x16, x6         //wx
+        mov             x17, x7         //ox
+        add             w15, w15, #6    //shift = denom+6
+
+
+        ldp             x5, x6, [sp]  // first two stack arguments (presumably mx, my - confirm against the prototype)
+        ldr             x7, [sp, #16]  // third stack argument (presumably width)
+
+        stp             d14, d15, [sp, #-64]!  // v8-v15: the low 64 bits are callee-saved under AAPCS64
+        stp             d8, d9, [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+
+        dup             v13.8h, w16     //wx
+        dup             v14.4s, w17     //ox
+
+        mov             w17, #1
+        lsl             w17, w17, w15
+        lsr             w17, w17, #1  // w17 = (1 << shift) >> 1, the rounding term
+        dup             v15.4s, w17
+
+        neg             w15, w15        // -shift
+        dup             v12.4s, w15     //shift
+.endm
+
+.macro epel_uni_w_hv_end
+        smull           v28.4s, v4.4h, v13.4h  // v4 (8 x int16) * wx, widened to 32 bits; v28/v29 are scratch
+        smull2          v29.4s, v4.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s  // + rounding term
+        add             v29.4s, v29.4s, v15.4s
+        sshl            v28.4s, v28.4s, v12.4s  // v12 holds -shift, so this shifts right
+        sshl            v29.4s, v29.4s, v12.4s
+        add             v28.4s, v28.4s, v14.4s  // + ox
+        add             v29.4s, v29.4s, v14.4s
+        sqxtn           v4.4h, v28.4s  // saturate back to int16, result replaces v4
+        sqxtn2          v4.8h, v29.4s
+.endm
+
+.macro epel_uni_w_hv_end2
+        smull           v28.4s, v4.4h, v13.4h  // same weighting as epel_uni_w_hv_end, for v4 and v5; v28-v31 are scratch
+        smull2          v29.4s, v4.8h, v13.8h
+        smull           v30.4s, v5.4h, v13.4h
+        smull2          v31.4s, v5.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s  // + rounding term
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v28.4s, v28.4s, v12.4s  // v12 holds -shift, so this shifts right
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+
+        add             v28.4s, v28.4s, v14.4s  // + ox
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v28.4s  // saturate back to int16, results replace v4 and v5
+        sqxtn2          v4.8h, v29.4s
+        sqxtn           v5.4h, v30.4s
+        sqxtn2          v5.8h, v31.4s
+.endm
+
+.macro epel_uni_w_hv_end3
+        smull           v1.4s,  v4.4h, v13.4h  // same weighting for v4, v5 and v6; scratch: v1, v2, v28-v31
+        smull2          v2.4s,  v4.8h, v13.8h
+        smull           v28.4s, v5.4h, v13.4h
+        smull2          v29.4s, v5.8h, v13.8h
+        smull           v30.4s, v6.4h, v13.4h
+        smull2          v31.4s, v6.8h, v13.8h
+        add             v1.4s, v1.4s, v15.4s  // + rounding term
+        add             v2.4s, v2.4s, v15.4s
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v1.4s, v1.4s, v12.4s  // v12 holds -shift, so this shifts right
+        sshl            v2.4s, v2.4s, v12.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+        add             v1.4s, v1.4s, v14.4s  // + ox
+        add             v2.4s, v2.4s, v14.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v1.4s  // saturate back to int16, results replace v4, v5 and v6
+        sqxtn2          v4.8h, v2.4s
+        sqxtn           v5.4h, v28.4s
+        sqxtn2          v5.8h, v29.4s
+        sqxtn           v6.4h, v30.4s
+        sqxtn2          v6.8h, v31.4s
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+        smull           \dst\().4s, \src0\().4h, v0.h[0]  // 4-tap filter over the low 4 lanes of four rows, taps in v0.h[0..3]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6  // >> 6 with saturation to int16; also clears the high half of dst
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+        smull2          \tmp\().4s, \src0\().8h, v0.h[0]  // same filter over the high 4 lanes, accumulated in tmp
+        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6  // fills the high half of dst, low half is kept
+.endm
+
+.macro load_epel_filterh freg, xreg
+        movrel          \xreg, epel_filters
+        add             \xreg, \xreg, \freg, lsl #2  // entries are 4 bytes apart; xreg is clobbered
+        ldr             s0, [\xreg]  // load exactly the 4 taps of this entry; an 8-byte load would run past the last entry
+        sxtl            v0.8h, v0.8b  // v0.h[0..3] = sign-extended taps (h[4..7] are zero and unused)
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4  // height, widened for the 64-bit arithmetic below
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7  // (height + 3) rows of 128 bytes
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!  // 48-byte frame: return address plus the registers needed after the call
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48  // callee destination = tmp_array
+        sub             x1, x2, x3  // callee source = x2 - x3 (presumably src - srcstride, i.e. one row above)
+        mov             x2, x3  // callee source stride
+        add             x3, x4, #3  // callee height = height + 3
+        mov             x4, x5  // callee filter index = x5 as loaded by epel_uni_w_hv_start
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48  // sp now points at row 0 of tmp_array
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.4h}, [sp], x10  // sp itself walks tmp_array, one 128-byte row per load
+        ld1             {v17.4h}, [sp], x10
+        ld1             {v18.4h}, [sp], x10
+1:      ld1             {v19.4h}, [sp], x10  // loop unrolled 4x: each step loads one new row and rotates the 4-row window
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h  // saturate to unsigned 8-bit pixels
+        str             s4, [x0]  // 4 output pixels
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v16.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v17.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v18.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.ne            1b
+2:
+        ldp             d8, d9, [sp, #16]  // height + 3 rows were consumed, so sp is back at the d8-d15 save area
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4  // height, widened for the 64-bit arithmetic below
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7  // (height + 3) rows of 128 bytes
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!  // 48-byte frame: return address plus the registers needed after the call
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48  // callee destination = tmp_array
+        sub             x1, x2, x3  // callee source = x2 - x3 (presumably src - srcstride, i.e. one row above)
+        mov             x2, x3  // callee source stride
+        add             x3, x4, #3  // callee height = height + 3
+        mov             x4, x5  // callee filter index = x5 as loaded by epel_uni_w_hv_start
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48  // sp now points at row 0 of tmp_array
+        load_epel_filterh x6, x5
+        sub             x1, x1, #4  // the first store of each row already advances x0 by 4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10  // sp itself walks tmp_array, one 128-byte row per load
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10  // loop unrolled 4x: each step loads one new row and rotates the 4-row window
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h  // saturate to unsigned 8-bit pixels
+        st1             {v4.s}[0], [x0], #4  // pixels 0..3
+        st1             {v4.h}[2], [x0], x1  // pixels 4..5, then on to the next row
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.ne            1b
+2:
+        ldp             d8, d9, [sp, #16]  // height + 3 rows were consumed, so sp is back at the d8-d15 save area
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4  // height, widened for the 64-bit arithmetic below
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7  // (height + 3) rows of 128 bytes
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!  // 48-byte frame: return address plus the registers needed after the call
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48  // callee destination = tmp_array
+        sub             x1, x2, x3  // callee source = x2 - x3 (presumably src - srcstride, i.e. one row above)
+        mov             x2, x3  // callee source stride
+        add             x3, x4, #3  // callee height = height + 3
+        mov             x4, x5  // callee filter index = x5 as loaded by epel_uni_w_hv_start
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48  // sp now points at row 0 of tmp_array
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10  // sp itself walks tmp_array, one 128-byte row per load
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10  // loop unrolled 4x: each step loads one new row and rotates the 4-row window
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h  // saturate to unsigned 8-bit pixels
+        st1             {v4.8b}, [x0], x1  // 8 output pixels
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.ne            1b
+2:
+        ldp             d8, d9, [sp, #16]  // height + 3 rows were consumed, so sp is back at the d8-d15 save area
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4  // height, widened for the 64-bit arithmetic below
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7  // (height + 3) rows of 128 bytes
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!  // 48-byte frame: return address plus the registers needed after the call
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48  // callee destination = tmp_array
+        sub             x1, x2, x3  // callee source = x2 - x3 (presumably src - srcstride, i.e. one row above)
+        mov             x2, x3  // callee source stride
+        add             x3, x4, #3  // callee height = height + 3
+        mov             x4, x5  // callee filter index = x5 as loaded by epel_uni_w_hv_start
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48  // sp now points at row 0 of tmp_array
+        load_epel_filterh x6, x5
+        sub             x1, x1, #8  // the first store of each row already advances x0 by 8
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10  // sp itself walks tmp_array; 16 values are loaded per row, 12 are used
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10  // loop unrolled 4x: each step loads one new row and rotates the 4-row window
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h  // saturate to unsigned 8-bit pixels
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8  // pixels 0..7
+        st1             {v4.s}[2], [x0], x1  // pixels 8..11, then on to the next row
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.ne            1b
+2:
+        ldp             d8, d9, [sp, #16]  // height + 3 rows were consumed, so sp is back at the d8-d15 save area
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1 // 16-pixel-wide variant: epel_h16 fills a stack temp array, then each output row combines 4 temp rows
+        epel_uni_w_hv_start                     // shared prologue macro, defined outside this hunk
+        sxtw            x4, w4                  // height as a 64-bit value
+
+        add             x10, x4, #3             // rows needed in the temp array = height + 3
+        lsl             x10, x10, #7            // * 128 bytes per row (MAX_PB_SIZE * 2)
+        sub             sp, sp, x10     // tmp_array: (height + 3) rows carved out of the stack
+        str             x30, [sp, #-48]!        // 48-byte frame below tmp_array; LR at [sp]
+        stp             x4, x6, [sp, #16]       // keep height and x6 (presumably my, set up by the prologue macro) across the call
+        stp             x0, x1, [sp, #32]       // keep dst and dststride across the call
+        add             x0, sp, #48             // arg0 = tmp_array
+        sub             x1, x2, x3              // arg1 = src - srcstride (one row above the block)
+        mov             x2, x3                  // arg2 = srcstride
+        add             x3, x4, #3              // arg3 = height + 3 rows
+        mov             x4, x5                  // arg4 = x5 (presumably mx, set up by the prologue macro)
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm) // fills tmp_array; x15-x17 are assumed to survive this call - TODO confirm
+        ldp             x4, x6, [sp, #16]       // restore height, x6
+        ldp             x0, x1, [sp, #32]       // restore dst, dststride
+        ldr             x30, [sp], #48          // restore LR and drop the frame; sp now points at tmp_array
+        load_epel_filterh x6, x5                // macro defined outside this hunk; presumably loads the filter selected by x6
+        mov             x10, #(MAX_PB_SIZE * 2) // tmp_array row stride in bytes
+        ld1             {v16.8h, v17.8h}, [sp], x10 // row 0: columns 0-7 in v16, 8-15 in v17; sp advances one row
+        ld1             {v18.8h, v19.8h}, [sp], x10 // row 1
+        ld1             {v20.8h, v21.8h}, [sp], x10 // row 2
+1:      ld1             {v22.8h, v23.8h}, [sp], x10 // newest row; window = v16, v18, v20, v22
+        subs            x4, x4, #1              // --height; the Z flag is consumed by the branch after the store
+        calc_epelh      v4, v16, v18, v20, v22  // calc_epelh/calc_epelh2: macros defined outside this hunk, presumably the vertical filter on low/high lanes; columns 0-7
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23  // columns 8-15
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        epel_uni_w_hv_end2                      // macro defined outside this hunk; presumably applies weight, shift and offset to v4/v5
+        sqxtun          v4.8b, v4.8h            // saturate columns 0-7 to unsigned 8 bit
+        sqxtun2         v4.16b, v5.8h           // saturate columns 8-15 into the upper half
+        st1             {v4.16b}, [x0], x1      // store 16 pixels; dst += dststride
+        b.eq            2f                      // height reached 0
+
+        ld1             {v16.8h, v17.8h}, [sp], x10 // next row overwrites the oldest; window = v18, v20, v22, v16
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10 // window = v20, v22, v16, v18
+        subs            x4, x4, #1
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10 // window = v22, v16, v18, v20
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.ne            1b                      // window is back in its initial order; repeat while height != 0
+2:
+        ldp             d8, d9, [sp, #16]       // sp has stepped over all height + 3 rows; restore d8-d15 (presumably saved by epel_uni_w_hv_start)
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64     // last pair, and release the 64-byte save area
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1 // 24-pixel-wide variant: epel_h24 fills a stack temp array, then each output row combines 4 temp rows
+        epel_uni_w_hv_start                     // shared prologue macro, defined outside this hunk
+        sxtw            x4, w4                  // height as a 64-bit value
+
+        add             x10, x4, #3             // rows needed in the temp array = height + 3
+        lsl             x10, x10, #7            // * 128 bytes per row (MAX_PB_SIZE * 2)
+        sub             sp, sp, x10     // tmp_array: (height + 3) rows carved out of the stack
+        str             x30, [sp, #-48]!        // 48-byte frame below tmp_array; LR at [sp]
+        stp             x4, x6, [sp, #16]       // keep height and x6 (presumably my, set up by the prologue macro) across the call
+        stp             x0, x1, [sp, #32]       // keep dst and dststride across the call
+        add             x0, sp, #48             // arg0 = tmp_array
+        sub             x1, x2, x3              // arg1 = src - srcstride (one row above the block)
+        mov             x2, x3                  // arg2 = srcstride
+        add             x3, x4, #3              // arg3 = height + 3 rows
+        mov             x4, x5                  // arg4 = x5 (presumably mx, set up by the prologue macro)
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm) // fills tmp_array; x15-x17 are assumed to survive this call - TODO confirm
+        ldp             x4, x6, [sp, #16]       // restore height, x6
+        ldp             x0, x1, [sp, #32]       // restore dst, dststride
+        ldr             x30, [sp], #48          // restore LR and drop the frame; sp now points at tmp_array
+        load_epel_filterh x6, x5                // macro defined outside this hunk; presumably loads the filter selected by x6
+        mov             x10, #(MAX_PB_SIZE * 2) // tmp_array row stride in bytes
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10 // row 0: columns 0-7, 8-15, 16-23; sp advances one row
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10 // row 1
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10 // row 2
+1:      ld1             {v25.8h, v26.8h, v27.8h}, [sp], x10 // newest row; window = v16, v19, v22, v25
+        subs            x4, x4, #1              // --height; the Z flag is consumed by the branch after the store
+        calc_epelh      v4, v16, v19, v22, v25  // calc_epelh/calc_epelh2: macros defined outside this hunk, presumably the vertical filter on low/high lanes; columns 0-7
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26  // columns 8-15
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27  // columns 16-23
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+
+        epel_uni_w_hv_end3                      // macro defined outside this hunk; presumably applies weight, shift and offset to v4/v5/v6
+        sqxtun          v4.8b, v4.8h            // saturate each group of 8 columns to unsigned 8 bit
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1 // store 24 pixels; dst += dststride
+        b.eq            2f                      // height reached 0
+
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10 // next row overwrites the oldest; window = v19, v22, v25, v16
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10 // window = v22, v25, v16, v19
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10 // window = v25, v16, v19, v22
+        subs            x4, x4, #1
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.ne            1b                      // window is back in its initial order; repeat while height != 0
+2:
+        ldp             d8, d9, [sp, #16]       // sp has stepped over all height + 3 rows; restore d8-d15 (presumably saved by epel_uni_w_hv_start)
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64     // last pair, and release the 64-byte save area
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1 // 32 wide = two calls to the 16-wide function on adjacent column blocks
+        ldp             x15, x16, [sp]          // first two stack arguments (mx, my per the C prototype)
+        mov             x17, #16
+        stp             x15, x16, [sp, #-96]!   // 96-byte frame; [sp] holds the callee's first two stack arguments
+        stp             x0, x30, [sp, #16]      // save dst and LR
+        stp             x1, x2, [sp, #32]       // save dststride, src
+        stp             x3, x4, [sp, #48]       // save srcstride, height
+        stp             x5, x6, [sp, #64]       // save denom, wx
+        stp             x17, x7, [sp, #80]      // save x17 (= 16) and ox
+
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm) // columns 0-15
+        ldp             x0, x30, [sp, #16]      // argument registers are caller-saved: reload them all for the second call
+        ldp             x1, x2, [sp, #32]
+        ldp             x3, x4, [sp, #48]
+        ldp             x5, x6, [sp, #64]
+        ldp             x17, x7, [sp, #80]
+        ldp             x15, x16, [sp], #96     // reload the two stack arguments and drop the frame
+        add             x0, x0, #16             // dst += 16 columns
+        add             x2, x2, #16             // src += 16 columns
+        mov             x17, #16
+        stp             x15, x16, [sp, #-32]!   // 32-byte frame; [sp] holds the callee's first two stack arguments
+        stp             x17, x30, [sp, #16]     // [sp, #16] = 16 (third stack argument slot), then LR
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm) // columns 16-31
+        ldp             x17, x30, [sp, #16]     // restore LR
+        ldp             x15, x16, [sp], #32     // drop the frame
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1 // 48 wide = two calls to the 24-wide function on adjacent column blocks
+        ldp             x15, x16, [sp]          // first two stack arguments (mx, my per the C prototype)
+        mov             x17, #24
+        stp             x15, x16, [sp, #-96]!   // 96-byte frame; [sp] holds the callee's first two stack arguments
+        stp             x0, x30, [sp, #16]      // save dst and LR
+        stp             x1, x2, [sp, #32]       // save dststride, src
+        stp             x3, x4, [sp, #48]       // save srcstride, height
+        stp             x5, x6, [sp, #64]       // save denom, wx
+        stp             x17, x7, [sp, #80]      // save x17 (= 24) and ox
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm) // columns 0-23
+        ldp             x0, x30, [sp, #16]      // argument registers are caller-saved: reload them all for the second call
+        ldp             x1, x2, [sp, #32]
+        ldp             x3, x4, [sp, #48]
+        ldp             x5, x6, [sp, #64]
+        ldp             x17, x7, [sp, #80]
+        ldp             x15, x16, [sp], #96     // reload the two stack arguments and drop the frame
+        add             x0, x0, #24             // dst += 24 columns
+        add             x2, x2, #24             // src += 24 columns
+        mov             x17, #24
+        stp             x15, x16, [sp, #-32]!   // 32-byte frame; [sp] holds the callee's first two stack arguments
+        stp             x17, x30, [sp, #16]     // [sp, #16] = 24 (third stack argument slot), then LR
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm) // columns 24-47
+        ldp             x17, x30, [sp, #16]     // restore LR
+        ldp             x15, x16, [sp], #32     // drop the frame
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1 // 64 wide = two calls to the 32-wide function on adjacent column blocks
+        ldp             x15, x16, [sp]          // first two stack arguments (mx, my per the C prototype)
+        mov             x17, #32
+        stp             x15, x16, [sp, #-96]!   // 96-byte frame; [sp] holds the callee's first two stack arguments
+        stp             x0, x30, [sp, #16]      // save dst and LR
+        stp             x1, x2, [sp, #32]       // save dststride, src
+        stp             x3, x4, [sp, #48]       // save srcstride, height
+        stp             x5, x6, [sp, #64]       // save denom, wx
+        stp             x17, x7, [sp, #80]      // save x17 (= 32) and ox
+
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm) // columns 0-31
+        ldp             x0, x30, [sp, #16]      // argument registers are caller-saved: reload them all for the second call
+        ldp             x1, x2, [sp, #32]
+        ldp             x3, x4, [sp, #48]
+        ldp             x5, x6, [sp, #64]
+        ldp             x17, x7, [sp, #80]
+        ldp             x15, x16, [sp], #96     // reload the two stack arguments and drop the frame
+        add             x0, x0, #32             // dst += 32 columns
+        add             x2, x2, #32             // src += 32 columns
+        mov             x17, #32
+        stp             x15, x16, [sp, #-32]!   // 32-byte frame; [sp] holds the callee's first two stack arguments
+        stp             x17, x30, [sp, #16]     // [sp, #16] = 32 (third stack argument slot), then LR
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm) // columns 32-63
+        ldp             x17, x30, [sp, #16]     // restore LR
+        ldp             x15, x16, [sp], #32     // drop the frame
+        ret
+endfunc
+
+
 #endif
 
 
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b448d755b9..e125b0cfb2 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -291,6 +296,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, 
qpel_uni_w_hv, _i8mm);
         }
 
-- 
2.38.0.windows.1

From a654b41fd8b100f631db49bd419ef65594ef32b3 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sun, 7 May 2023 16:58:30 +0800
Subject: [PATCH v1 1/5] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_pixels

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 104 ++++++++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 483a9d5253..5a1d520eec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
     void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
 
+NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
 
 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
@@ -263,6 +266,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index ed659cfe9b..ed5b5027db 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -490,6 +490,110 @@ put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
 
+function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1 // plain copy of a 4-byte-wide block, two rows per iteration (x0 dst, x1 dststride, x2 src, x3 srcstride, w4 height)
+1:
+        ldr             s0, [x2]                // row n: 4 bytes
+        ldr             s1, [x2, x3]            // row n + 1
+        subs            w4, w4, #2              // height -= 2
+        add             x2, x2, x3, lsl #1      // src += 2 * srcstride
+        str             s0, [x0]
+        str             s1, [x0, x1]
+        add             x0, x0, x1, lsl #1      // dst += 2 * dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1 // plain copy of a 6-byte-wide block, two rows per iteration
+        sub             x1, x1, #4              // dststride - 4: each row's first store post-increments dst by 4
+1:
+        ldr             d0, [x2]                // row n: 8 bytes read, only 6 are stored
+        ldr             d1, [x2, x3]            // row n + 1
+        subs            w4, w4, #2              // height -= 2
+        add             x2, x2, x3, lsl #1      // src += 2 * srcstride
+        str             s0, [x0], #4            // bytes 0-3
+        st1             {v0.h}[2], [x0], x1     // bytes 4-5, then step to the next dst row
+        str             s1, [x0], #4
+        st1             {v1.h}[2], [x0], x1
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1 // plain copy of an 8-byte-wide block, two rows per iteration
+1:
+        ldr             d0, [x2]                // row n: 8 bytes
+        ldr             d1, [x2, x3]            // row n + 1
+        subs            w4, w4, #2              // height -= 2
+        add             x2, x2, x3, lsl #1      // src += 2 * srcstride
+        str             d0, [x0]
+        str             d1, [x0, x1]
+        add             x0, x0, x1, lsl #1      // dst += 2 * dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1 // plain copy of a 12-byte-wide block, two rows per iteration
+        sub             x1, x1, #8              // dststride - 8: each row's first store post-increments dst by 8
+1:
+        ldr             q0, [x2]                // row n: 16 bytes read, only 12 are stored
+        ldr             q1, [x2, x3]            // row n + 1
+        subs            w4, w4, #2              // height -= 2
+        add             x2, x2, x3, lsl #1      // src += 2 * srcstride
+        str             d0, [x0], #8            // bytes 0-7
+        st1             {v0.s}[2], [x0], x1     // bytes 8-11, then step to the next dst row
+        str             d1, [x0], #8
+        st1             {v1.s}[2], [x0], x1
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1 // plain copy of a 16-byte-wide block, two rows per iteration
+1:
+        ldr             q0, [x2]                // row n: 16 bytes
+        ldr             q1, [x2, x3]            // row n + 1
+        subs            w4, w4, #2              // height -= 2
+        add             x2, x2, x3, lsl #1      // src += 2 * srcstride
+        str             q0, [x0]
+        str             q1, [x0, x1]
+        add             x0, x0, x1, lsl #1      // dst += 2 * dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1 // plain copy of a 24-byte-wide block, one row per iteration
+1:
+        ld1             {v0.8b, v1.8b, v2.8b}, [x2], x3 // load 24 bytes; src += srcstride
+        subs            w4, w4, #1              // --height
+        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1 // store 24 bytes; dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1 // plain copy of a 32-byte-wide block, one row per iteration
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3 // load 32 bytes; src += srcstride
+        subs            w4, w4, #1              // --height
+        st1             {v0.16b, v1.16b}, [x0], x1 // store 32 bytes; dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1 // plain copy of a 48-byte-wide block, one row per iteration
+1:
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3 // load 48 bytes; src += srcstride
+        subs            w4, w4, #1              // --height
+        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1 // store 48 bytes; dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1 // plain copy of a 64-byte-wide block, one row per iteration
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 // load 64 bytes; src += srcstride
+        subs            w4, w4, #1              // --height
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 // store 64 bytes; dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
 
 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
         mov             w10, #-6
-- 
2.38.0.windows.1

From 9985cbcc0aa402d9920dd690b6f6a71392d62f79 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sun, 28 May 2023 10:07:28 +0800
Subject: [PATCH v1 2/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_h

---
 libavcodec/aarch64/Makefile               |   1 +
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 377 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +-
 3 files changed, 384 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 216191640c..cb428b49e0 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -69,4 +69,5 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_deblock_neon.o      \
                                            aarch64/hevcdsp_idct_neon.o         
\
                                            aarch64/hevcdsp_init_aarch64.o      
\
                                            aarch64/hevcdsp_qpel_neon.o         
\
+                                           aarch64/hevcdsp_epel_neon.o         
\
                                            aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
new file mode 100644
index 0000000000..0411de9864
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -0,0 +1,377 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+const epel_filters, align=4 // 8 filters of four signed byte taps each; EPEL_UNI_W_H_HEADER indexes it with mx * 4
+        .byte  0,  0,  0,  0                    // index 0: all-zero taps
+        .byte -2, 58, 10, -2
+        .byte -4, 54, 16, -2
+        .byte -6, 46, 28, -4
+        .byte -4, 36, 36, -4                    // index 4: symmetric taps
+        .byte -4, 28, 46, -6                    // indices 5-7 mirror indices 3-1
+        .byte -2, 16, 54, -4
+        .byte -2, 10, 58, -2
+endconst
+
+#if HAVE_I8MM
+.macro EPEL_UNI_W_H_HEADER // common setup for epel_uni_w_h*: leaves v28 = taps, v29 = ox, v30 = wx, v31 = -(denom + 6); clobbers x9, x10, x12
+        ldr             x12, [sp]               // first stack argument (mx per the C prototype)
+        sub             x2, x2, #1              // src -= 1: each 4-byte window starts one byte before its output pixel
+        movrel          x9, epel_filters
+        add             x9, x9, x12, lsl #2     // &epel_filters[mx], 4 bytes per filter
+        ld1r            {v28.4s}, [x9]          // replicate the 4 taps into every 32-bit lane
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -(denom + 6)
+        dup             v30.4s, w6              // v30 = wx in every lane
+        dup             v31.4s, w10             // v31 = negative count, so sqrshl performs a rounding right shift by denom + 6
+        dup             v29.4s, w7              // v29 = ox in every lane
+.endm
+
+
+function ff_hevc_put_hevc_epel_uni_w_h4_8_neon_i8mm, export=1 // 4-wide horizontal 4-tap filter with weighting, one row per iteration
+        EPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.8b}, [x2], x3       // 8 source bytes; src += srcstride
+        subs            w4, w4, #1              // --height; flags are consumed by b.hi at the bottom
+        ext             v1.8b, v0.8b, v0.8b, #1 // source rotated by 1 byte
+        ext             v2.8b, v0.8b, v0.8b, #2 // ... by 2 bytes
+        ext             v3.8b, v0.8b, v0.8b, #3 // ... by 3 bytes
+        trn1            v0.2s, v0.2s, v2.2s     // [bytes 0-3 | bytes 2-5]
+        trn1            v1.2s, v1.2s, v3.2s     // [bytes 1-4 | bytes 3-6]
+        zip1            v0.4s, v0.4s, v1.4s     // one 4-byte window per output pixel 0..3
+        movi            v16.2d, #0              // clear the accumulator
+        usdot           v16.4s, v0.16b, v28.16b // per lane: unsigned bytes dotted with the signed taps
+        mul             v16.4s, v16.4s, v30.4s  // * wx
+        sqrshl          v16.4s, v16.4s, v31.4s  // rounding right shift by denom + 6
+        sqadd           v16.4s, v16.4s, v29.4s  // + ox
+        sqxtn           v16.4h, v16.4s          // saturate 32 -> 16 bit
+        sqxtun          v16.8b, v16.8h          // saturate 16 -> unsigned 8 bit
+        str             s16, [x0]               // store 4 pixels
+        add             x0, x0, x1              // dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1 // 6-wide horizontal 4-tap filter with weighting, one row per iteration
+        EPEL_UNI_W_H_HEADER
+        sub             x1, x1, #4              // dststride - 4: the first store of each row post-increments dst by 4
+1:
+        ld1             {v0.16b}, [x2], x3      // 16 source bytes; src += srcstride
+        subs            w4, w4, #1              // --height; flags are consumed by b.hi at the bottom
+        ext             v1.16b, v0.16b, v0.16b, #1 // source rotated by 1, 2, 3 bytes
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        trn1            v4.2s, v0.2s, v1.2s     // windows for pixels 0, 1
+        trn2            v6.2s, v0.2s, v1.2s     // windows for pixels 4, 5
+        trn1            v5.2s, v2.2s, v3.2s     // windows for pixels 2, 3
+        zip1            v4.2d, v4.2d, v5.2d     // windows for pixels 0..3
+        movi            v16.2d, #0              // clear the accumulators
+        movi            v17.2d, #0
+        usdot           v16.4s, v4.16b, v28.16b // filter sums for pixels 0..3
+        usdot           v17.2s, v6.8b, v28.8b   // filter sums for pixels 4, 5
+        mul             v16.4s, v16.4s, v30.4s  // * wx
+        mul             v17.2s, v17.2s, v30.2s
+        sqrshl          v16.4s, v16.4s, v31.4s  // rounding right shift by denom + 6
+        sqrshl          v17.2s, v17.2s, v31.2s
+        sqadd           v16.4s, v16.4s, v29.4s  // + ox
+        sqadd           v17.2s, v17.2s, v29.2s
+        sqxtn           v16.4h, v16.4s          // saturate 32 -> 16 bit: pixels 0..3
+        sqxtn2          v16.8h, v17.4s          // pixels 4, 5 into lanes 4, 5
+        sqxtun          v16.8b, v16.8h          // saturate 16 -> unsigned 8 bit
+        str             s16, [x0], #4           // pixels 0-3
+        st1             {v16.h}[2], [x0], x1    // pixels 4-5, then step to the next dst row
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+.macro  EPEL_UNI_W_H_CALC s0, s1, d0, d1 // for two vectors of 4-byte windows (\s0, \s1): filter, weight, shift and offset into 32-bit lanes of \d0, \d1; reads v28-v31
+        movi            \d0\().2d, #0           // clear both accumulators
+        movi            \d1\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b // per lane: unsigned bytes dotted with the signed taps in v28
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        mul             \d0\().4s, \d0\().4s, v30.4s // * v30 (wx)
+        mul             \d1\().4s, \d1\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s // saturating rounding shift by v31 (negative count = right shift)
+        sqrshl          \d1\().4s, \d1\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s // + v29 (ox), saturating
+        sqadd           \d1\().4s, \d1\().4s, v29.4s
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_h8_8_neon_i8mm, export=1 // 8-wide horizontal 4-tap filter with weighting, one row per iteration
+        EPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.16b}, [x2], x3      // 16 source bytes; src += srcstride
+        subs            w4, w4, #1              // --height; flags are consumed by b.hi at the bottom
+        ext             v1.16b, v0.16b, v0.16b, #1 // source rotated by 1, 2, 3 bytes
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v4.4s, v0.4s, v2.4s     // windows for pixels 0, 2, 4, 6
+        zip1            v5.4s, v1.4s, v3.4s     // windows for pixels 1, 3, 5, 7
+        EPEL_UNI_W_H_CALC v4, v5, v16, v17      // v16 = even pixels, v17 = odd pixels
+        sqxtn           v16.4h, v16.4s          // saturate 32 -> 16 bit
+        sqxtn           v17.4h, v17.4s
+        zip1            v16.8h, v16.8h, v17.8h  // interleave even/odd back into pixel order 0..7
+        sqxtun          v16.8b, v16.8h          // saturate 16 -> unsigned 8 bit
+        str             d16, [x0]               // store 8 pixels
+        add             x0, x0, x1              // dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h12_8_neon_i8mm, export=1 // 12-wide horizontal 4-tap filter with weighting, one row per iteration
+        EPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.16b}, [x2], x3      // 16 source bytes; src += srcstride
+        subs            w4, w4, #1              // --height; flags are consumed by b.hi at the bottom
+        ext             v1.16b, v0.16b, v0.16b, #1 // source rotated by 1, 2, 3 bytes
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v4.4s, v0.4s, v2.4s     // windows for pixels 0, 2, 4, 6
+        zip1            v5.4s, v1.4s, v3.4s     // windows for pixels 1, 3, 5, 7
+        zip2            v6.4s, v0.4s, v2.4s     // windows for pixels 8, 10, (12, 14)
+        zip2            v7.4s, v1.4s, v3.4s     // windows for pixels 9, 11, (13, 15)
+        zip1            v6.4s, v6.4s, v7.4s     // windows for pixels 8, 9, 10, 11
+        EPEL_UNI_W_H_CALC v4, v5, v16, v17      // v16 = even pixels 0-6, v17 = odd pixels 1-7
+        movi            v18.2d, #0              // same filter/weight/shift/offset sequence for pixels 8..11
+        usdot           v18.4s, v6.16b, v28.16b
+        mul             v18.4s, v18.4s, v30.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqxtn           v16.4h, v16.4s          // saturate 32 -> 16 bit
+        sqxtn           v17.4h, v17.4s
+        sqxtn           v18.4h, v18.4s
+        zip1            v16.8h, v16.8h, v17.8h  // interleave even/odd back into pixel order 0..7
+        sqxtun          v16.8b, v16.8h          // saturate 16 -> unsigned 8 bit
+        sqxtun          v18.8b, v18.8h
+        str             d16, [x0]               // pixels 0-7
+        str             s18, [x0, #8]           // pixels 8-11
+        add             x0, x0, x1              // dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h16_8_neon_i8mm, export=1 // 16-wide horizontal 4-tap filter with weighting, one row per iteration
+        EPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3 // 32 source bytes; src += srcstride
+        subs            w4, w4, #1              // --height; flags are consumed by b.hi at the bottom
+        ext             v4.16b, v0.16b, v1.16b, #1 // source bytes 1-16
+        ext             v5.16b, v0.16b, v1.16b, #2 // source bytes 2-17
+        ext             v6.16b, v0.16b, v1.16b, #3 // source bytes 3-18
+        zip1            v20.4s, v0.4s, v5.4s    // windows for pixels 0, 2, 4, 6
+        zip1            v21.4s, v4.4s, v6.4s    // windows for pixels 1, 3, 5, 7
+        zip2            v22.4s, v0.4s, v5.4s    // windows for pixels 8, 10, 12, 14
+        zip2            v23.4s, v4.4s, v6.4s    // windows for pixels 9, 11, 13, 15
+        EPEL_UNI_W_H_CALC v20, v21, v16, v17
+        EPEL_UNI_W_H_CALC v22, v23, v18, v19
+        sqxtn           v16.4h, v16.4s          // even pixels 0-6
+        sqxtn           v17.4h, v17.4s          // odd pixels 1-7
+        sqxtn2          v16.8h, v18.4s          // even pixels 8-14 into the upper half
+        sqxtn2          v17.8h, v19.4s          // odd pixels 9-15 into the upper half
+        sqxtun          v16.8b, v16.8h          // saturate 16 -> unsigned 8 bit
+        sqxtun          v17.8b, v17.8h
+        st2             {v16.8b, v17.8b}, [x0], x1 // interleaving store restores pixel order; dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h24_8_neon_i8mm, export=1 // 24-wide horizontal 4-tap filter with weighting, one row per iteration
+        EPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3 // 32 source bytes; src += srcstride
+        subs            w4, w4, #1              // --height; flags are consumed by b.hi at the bottom
+        ext             v2.16b, v0.16b, v1.16b, #1 // source bytes 1-16
+        ext             v3.16b, v0.16b, v1.16b, #2 // source bytes 2-17
+        ext             v4.16b, v0.16b, v1.16b, #3 // source bytes 3-18
+        ext             v5.16b, v1.16b, v1.16b, #1 // second register rotated by 1, 2, 3 bytes; only its low 8 bytes are used below
+        ext             v6.16b, v1.16b, v1.16b, #2
+        ext             v7.16b, v1.16b, v1.16b, #3
+        zip1            v20.4s, v0.4s, v3.4s    // windows for pixels 0, 2, 4, 6
+        zip1            v21.4s, v2.4s, v4.4s    // windows for pixels 1, 3, 5, 7
+        zip2            v22.4s, v0.4s, v3.4s    // windows for pixels 8, 10, 12, 14
+        zip2            v23.4s, v2.4s, v4.4s    // windows for pixels 9, 11, 13, 15
+        zip1            v24.4s, v1.4s, v6.4s    // windows for pixels 16, 18, 20, 22
+        zip1            v25.4s, v5.4s, v7.4s    // windows for pixels 17, 19, 21, 23
+        EPEL_UNI_W_H_CALC v20, v21, v16, v17
+        EPEL_UNI_W_H_CALC v22, v23, v18, v19
+        EPEL_UNI_W_H_CALC v24, v25, v26, v27
+        sqxtn           v16.4h, v16.4s          // saturate 32 -> 16 bit
+        sqxtn           v17.4h, v17.4s
+        sqxtn           v18.4h, v18.4s
+        sqxtn           v19.4h, v19.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn           v27.4h, v27.4s
+        zip1            v16.8h, v16.8h, v17.8h  // interleave even/odd: pixels 0-7
+        zip1            v18.8h, v18.8h, v19.8h  // pixels 8-15
+        zip1            v26.8h, v26.8h, v27.8h  // pixels 16-23
+        sqxtun          v16.8b, v16.8h          // saturate 16 -> unsigned 8 bit
+        sqxtun2         v16.16b, v18.8h
+        sqxtun          v26.8b, v26.8h
+        str             q16, [x0]               // pixels 0-15
+        str             d26, [x0, #16]          // pixels 16-23
+        add             x0, x0, x1              // dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h32_8_neon_i8mm, export=1 // 32-wide horizontal 4-tap filter with weighting, one row per iteration
+        EPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3 // 48 source bytes; src += srcstride
+        subs            w4, w4, #1              // --height; flags are consumed by b.hi at the bottom
+        ext             v3.16b, v0.16b, v1.16b, #1 // source bytes 1-16
+        ext             v4.16b, v0.16b, v1.16b, #2 // source bytes 2-17
+        ext             v5.16b, v0.16b, v1.16b, #3 // source bytes 3-18
+        ext             v16.16b, v1.16b, v2.16b, #1 // source bytes 17-32
+        ext             v17.16b, v1.16b, v2.16b, #2 // source bytes 18-33
+        ext             v18.16b, v1.16b, v2.16b, #3 // source bytes 19-34
+        EPEL_UNI_W_H_CALC v0, v3, v6, v7        // a vector shifted by k bytes yields pixels k, k+4, k+8, k+12: v6 = pixels 0 mod 4, v7 = 1 mod 4
+        EPEL_UNI_W_H_CALC v4, v5, v19, v20      // v19 = pixels 2 mod 4, v20 = 3 mod 4
+        EPEL_UNI_W_H_CALC v1, v16, v21, v22     // same for pixels 16-31
+        EPEL_UNI_W_H_CALC v17, v18, v23, v24
+        sqxtn           v6.4h, v6.4s            // pixels 0, 4, 8, 12 ...
+        sqxtn2          v6.8h, v21.4s           // ... followed by 16, 20, 24, 28
+        sqxtn           v7.4h, v7.4s
+        sqxtn2          v7.8h, v22.4s
+        sqxtn           v19.4h, v19.4s
+        sqxtn2          v19.8h, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v24.4s
+        sqxtun          v0.8b, v6.8h            // saturate 16 -> unsigned 8 bit
+        sqxtun          v1.8b, v7.8h
+        sqxtun          v2.8b, v19.8h
+        sqxtun          v3.8b, v20.8h
+        st4             {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x1 // 4-way interleaving store restores pixel order; dst += dststride
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_epel_uni_w_h48_8_neon_i8mm, export=1 // 48-wide horizontal 4-tap filter with weighting, one row per iteration
+        EPEL_UNI_W_H_HEADER
+        sub             x1, x1, #32             // dststride - 32: the first store of each row post-increments dst by 32
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 // 64 source bytes; src += srcstride
+        subs            w4, w4, #1              // --height; flags are consumed by b.hi at the bottom
+        ext             v4.16b, v0.16b, v1.16b, #1 // source bytes 1-16
+        ext             v5.16b, v0.16b, v1.16b, #2 // source bytes 2-17
+        ext             v6.16b, v0.16b, v1.16b, #3 // source bytes 3-18
+        ext             v16.16b, v1.16b, v2.16b, #1 // source bytes 17-32
+        ext             v17.16b, v1.16b, v2.16b, #2 // source bytes 18-33
+        ext             v18.16b, v1.16b, v2.16b, #3 // source bytes 19-34
+        EPEL_UNI_W_H_CALC v0, v4, v19, v20      // pixels 0 mod 4 and 1 mod 4 of columns 0-15
+        EPEL_UNI_W_H_CALC v5, v6, v21, v22      // pixels 2 mod 4 and 3 mod 4 of columns 0-15
+        EPEL_UNI_W_H_CALC v1, v16, v23, v24     // same for columns 16-31
+        EPEL_UNI_W_H_CALC v17, v18, v25, v26
+        sqxtn           v19.4h, v19.4s          // saturate 32 -> 16 bit, columns 16-31 into the upper halves
+        sqxtn2          v19.8h, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v24.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v25.4s
+        sqxtn           v22.4h, v22.4s
+        sqxtn2          v22.8h, v26.4s
+        sqxtun          v19.8b, v19.8h          // saturate 16 -> unsigned 8 bit
+        sqxtun          v20.8b, v20.8h
+        sqxtun          v21.8b, v21.8h
+        sqxtun          v22.8b, v22.8h
+        st4             {v19.8b, v20.8b, v21.8b, v22.8b}, [x0], #32 // pixels 0-31 in order; dst += 32
+        ext             v5.16b, v2.16b, v3.16b, #1 // source bytes 33-48
+        ext             v6.16b, v2.16b, v3.16b, #2 // source bytes 34-49
+        ext             v7.16b, v2.16b, v3.16b, #3 // source bytes 35-50
+        EPEL_UNI_W_H_CALC v2, v5, v19, v20      // pixels 32, 36, 40, 44 and 33, 37, 41, 45
+        EPEL_UNI_W_H_CALC v6, v7, v21, v22      // pixels 34, 38, 42, 46 and 35, 39, 43, 47
+        sqxtn           v19.4h, v19.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn           v22.4h, v22.4s
+        zip1            v4.8h, v19.8h, v21.8h   // even pixels 32, 34, ..., 46
+        zip1            v5.8h, v20.8h, v22.8h   // odd pixels 33, 35, ..., 47
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        st2             {v4.8b, v5.8b}, [x0], x1 // pixels 32-47 in order; step to the next dst row
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1 // 64-wide horizontal 4-tap filter with weighting, one row per iteration
+        EPEL_UNI_W_H_HEADER
+        sub             x1, x1, #32             // dststride - 32: the first store of each row post-increments dst by 32
+        sub             x3, x3, #64             // srcstride - 64: the first load of each row post-increments src by 64
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // source bytes 0-63; src += 64
+        subs            w4, w4, #1              // --height; flags are consumed by b.hi at the bottom
+        ext             v4.16b, v0.16b, v1.16b, #1 // source bytes 1-16
+        ext             v5.16b, v0.16b, v1.16b, #2 // source bytes 2-17
+        ext             v6.16b, v0.16b, v1.16b, #3 // source bytes 3-18
+        ext             v16.16b, v1.16b, v2.16b, #1 // source bytes 17-32
+        ext             v17.16b, v1.16b, v2.16b, #2 // source bytes 18-33
+        ext             v18.16b, v1.16b, v2.16b, #3 // source bytes 19-34
+        EPEL_UNI_W_H_CALC v0, v4, v19, v20      // pixels 0 mod 4 and 1 mod 4 of columns 0-15
+        EPEL_UNI_W_H_CALC v5, v6, v21, v22      // pixels 2 mod 4 and 3 mod 4 of columns 0-15
+        EPEL_UNI_W_H_CALC v1, v16, v23, v24     // same for columns 16-31
+        EPEL_UNI_W_H_CALC v17, v18, v25, v26
+        sqxtn           v19.4h, v19.4s          // saturate 32 -> 16 bit, columns 16-31 into the upper halves
+        sqxtn2          v19.8h, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v24.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v25.4s
+        sqxtn           v22.4h, v22.4s
+        sqxtn2          v22.8h, v26.4s
+        sqxtun          v19.8b, v19.8h          // saturate 16 -> unsigned 8 bit
+        sqxtun          v20.8b, v20.8h
+        sqxtun          v21.8b, v21.8h
+        sqxtun          v22.8b, v22.8h
+        st4             {v19.8b, v20.8b, v21.8b, v22.8b}, [x0], #32 // pixels 0-31 in order; dst += 32
+        ld1             {v7.8b}, [x2], x3       // 8 more source bytes for the last windows; src moves to the next row
+        ext             v4.16b, v2.16b, v3.16b, #1 // source bytes 33-48
+        ext             v5.16b, v2.16b, v3.16b, #2 // source bytes 34-49
+        ext             v6.16b, v2.16b, v3.16b, #3 // source bytes 35-50
+        ext             v16.16b, v3.16b, v7.16b, #1 // source bytes 49-64
+        ext             v17.16b, v3.16b, v7.16b, #2 // source bytes 50-65
+        ext             v18.16b, v3.16b, v7.16b, #3 // source bytes 51-66
+        EPEL_UNI_W_H_CALC v2, v4, v19, v20      // same scheme for columns 32-47
+        EPEL_UNI_W_H_CALC v5, v6, v21, v22
+        EPEL_UNI_W_H_CALC v3, v16, v23, v24     // and for columns 48-63
+        EPEL_UNI_W_H_CALC v17, v18, v25, v26
+        sqxtn           v19.4h, v19.4s
+        sqxtn2          v19.8h, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v24.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v25.4s
+        sqxtn           v22.4h, v22.4s
+        sqxtn2          v22.8h, v26.4s
+        sqxtun          v19.8b, v19.8h
+        sqxtun          v20.8b, v20.8h
+        sqxtun          v21.8b, v21.8h
+        sqxtun          v22.8b, v22.8h
+        st4             {v19.8b, v20.8b, v21.8b, v22.8b}, [x0], x1 // pixels 32-63 in order; step to the next dst row
+        b.hi            1b                      // loop while the subtraction left height > 0
+        ret
+endfunc
+
+#endif
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 5a1d520eec..8af0a2b4b9 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -166,6 +166,10 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
 
 NEON8_FNPROTO(qpel_h, (int16_t *dst,
         const uint8_t *_src, ptrdiff_t _srcstride,
@@ -273,8 +277,9 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
-            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, 
qpel_uni_w_hv, _i8mm);
         }
 
-- 
2.38.0.windows.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to