aarch64: new optimization for 8-bit hevc_epel_uni_w_hv

Logan.Lyu Sun, 18 Jun 2023 01:25:40 -0700

Hi, Martin,

I modified it according to your comments. Please review again.


And here are the checkasm benchmark results of the related functions:

put_hevc_epel_uni_w_hv4_8_c: 254.6
put_hevc_epel_uni_w_hv4_8_i8mm: 102.9
put_hevc_epel_uni_w_hv6_8_c: 411.6
put_hevc_epel_uni_w_hv6_8_i8mm: 221.6
put_hevc_epel_uni_w_hv8_8_c: 669.4
put_hevc_epel_uni_w_hv8_8_i8mm: 214.9
put_hevc_epel_uni_w_hv12_8_c: 1412.6
put_hevc_epel_uni_w_hv12_8_i8mm: 481.4
put_hevc_epel_uni_w_hv16_8_c: 2425.4
put_hevc_epel_uni_w_hv16_8_i8mm: 647.4
put_hevc_epel_uni_w_hv24_8_c: 5384.1
put_hevc_epel_uni_w_hv24_8_i8mm: 1450.6
put_hevc_epel_uni_w_hv32_8_c: 9470.9
put_hevc_epel_uni_w_hv32_8_i8mm: 2497.1
put_hevc_epel_uni_w_hv48_8_c: 20930.1
put_hevc_epel_uni_w_hv48_8_i8mm: 5635.9
put_hevc_epel_uni_w_hv64_8_c: 36682.9
put_hevc_epel_uni_w_hv64_8_i8mm: 9712.6



在 2023/6/12 16:19, Martin Storsjö 写道:

On Sun, 4 Jun 2023, logan....@myais.com.cn wrote:

From: Logan Lyu <logan....@myais.com.cn>

Signed-off-by: Logan Lyu <logan....@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_epel_neon.S    | 703 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +
2 files changed, 710 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.Sb/libavcodec/aarch64/hevcdsp_epel_neon.S

index 32f052a7b1..24a74d2c7d 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S

@@ -718,6 +718,709 @@ functionff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1

        ret
endfunc

+.macro epel_uni_w_hv_start
+        mov             x15, x5         //denom
+        mov             x16, x6         //wx
+        mov             x17, x7         //ox
+        add             w15, w15, #6    //shift = denom+6
+
+
+        ldp             x5, x6, [sp]
+        ldp             x7, xzr, [sp, #16]


Why ldp into xzr, that seems pointless?

+
+        sub             sp, sp, #128
+        stp             q12, q13, [sp]


This could be "stp q12, q13, [sp, #-128]!"

+        stp             q14, q15, [sp, #32]
+        stp             q8, q9,   [sp, #64]
+        stp             q10, q11, [sp, #96]
+
+        dup             v13.8h, w16     //wx
+        dup             v14.4s, w17     //ox
+
+        mov             w17, #1
+        lsl             w17, w17, w15
+        lsr             w17, w17, #1
+        dup             v15.4s, w17
+
+        neg             w15, w15        // -shift
+        dup             v12.4s, w15     //shift
+.endm
+
+.macro epel_uni_w_hv_end
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+.endm
+
+.macro epel_uni_w_hv_end2
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        smull           v30.4s, v5.4h, v13.4h
+        smull2          v31.4s, v5.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+        sqxtn           v5.4h, v30.4s
+        sqxtn2          v5.8h, v31.4s
+.endm
+
+.macro epel_uni_w_hv_end3
+        smull           v1.4s,  v4.4h, v13.4h
+        smull2          v2.4s,  v4.8h, v13.8h
+        smull           v28.4s, v5.4h, v13.4h
+        smull2          v29.4s, v5.8h, v13.8h
+        smull           v30.4s, v6.4h, v13.4h
+        smull2          v31.4s, v6.8h, v13.8h
+        add             v1.4s, v1.4s, v15.4s
+        add             v2.4s, v2.4s, v15.4s
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v1.4s, v1.4s, v12.4s
+        sshl            v2.4s, v2.4s, v12.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+        add             v1.4s, v1.4s, v14.4s
+        add             v2.4s, v2.4s, v14.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v1.4s
+        sqxtn2          v4.8h, v2.4s
+        sqxtn           v5.4h, v28.4s
+        sqxtn2          v5.8h, v29.4s
+        sqxtn           v6.4h, v30.4s
+        sqxtn2          v6.8h, v31.4s
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
+        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6
+.endm
+
+.macro load_epel_filterh freg, xreg
+        movrel          \xreg, epel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld1             {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff

What does this "and" do here? Is it a case where the argument is"int", while the upper bits of the register is undefined? In thosecases, you're best off by just using "w4", possibly "w4, uxtw" (orsxtw) instead of manually doing such an "and" here.

+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!

Don't do consecutive decrements like this, but do one "stp ..., [sp,#-48]!" followed by "stp ..., [sp, #16]" etc.

+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.4h}, [sp], x10
+        ld1             {v17.4h}, [sp], x10
+        ld1             {v18.4h}, [sp], x10
+1:      ld1             {v19.4h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.4h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v17.4h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v18.4h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128


Fold the stack increment into ldp, like "ldp q12, q13, [sp], #128".

The same thing applies to all other functions in this patch too.

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.cb/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 348497bbbe..fbbc4e6071 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
        int height, int denom, int wx, int ox,
        intptr_t mx, intptr_t my, int width), _i8mm);

+NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t_dststride,
        const uint8_t *_src, ptrdiff_t _srcstride,
        int height, int denom, int wx, int ox,
@@ -286,11 +291,13 @@ av_cold voidff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0,qpel_uni_w_v,);
+
        if (have_i8mm(cpu_flags)) {


Stray whitespace change.

// Martin

From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sun, 28 May 2023 10:35:43 +0800
Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_hv

Signed-off-by: Logan Lyu <logan....@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 694 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
 2 files changed, 700 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 8b6f396a0b..355679af29 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -717,6 +717,700 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, 
export=1
         ret
 endfunc
 
+.macro epel_uni_w_hv_start
+        mov             x15, x5         //denom
+        mov             x16, x6         //wx
+        mov             x17, x7         //ox
+        add             w15, w15, #6    //shift = denom+6
+
+
+        ldp             x5, x6, [sp]
+        ldr             x7, [sp, #16]
+
+        stp             q12, q13, [sp, #-128]!
+        stp             q14, q15, [sp, #32]
+        stp             q8, q9,   [sp, #64]
+        stp             q10, q11, [sp, #96]
+
+        dup             v13.8h, w16     //wx
+        dup             v14.4s, w17     //ox
+
+        mov             w17, #1
+        lsl             w17, w17, w15
+        lsr             w17, w17, #1
+        dup             v15.4s, w17
+
+        neg             w15, w15        // -shift
+        dup             v12.4s, w15     //shift
+.endm
+
+.macro epel_uni_w_hv_end
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+.endm
+
+.macro epel_uni_w_hv_end2
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        smull           v30.4s, v5.4h, v13.4h
+        smull2          v31.4s, v5.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+        sqxtn           v5.4h, v30.4s
+        sqxtn2          v5.8h, v31.4s
+.endm
+
+.macro epel_uni_w_hv_end3
+        smull           v1.4s,  v4.4h, v13.4h
+        smull2          v2.4s,  v4.8h, v13.8h
+        smull           v28.4s, v5.4h, v13.4h
+        smull2          v29.4s, v5.8h, v13.8h
+        smull           v30.4s, v6.4h, v13.4h
+        smull2          v31.4s, v6.8h, v13.8h
+        add             v1.4s, v1.4s, v15.4s
+        add             v2.4s, v2.4s, v15.4s
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v1.4s, v1.4s, v12.4s
+        sshl            v2.4s, v2.4s, v12.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+        add             v1.4s, v1.4s, v14.4s
+        add             v2.4s, v2.4s, v14.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v1.4s
+        sqxtn2          v4.8h, v2.4s
+        sqxtn           v5.4h, v28.4s
+        sqxtn2          v5.8h, v29.4s
+        sqxtn           v6.4h, v30.4s
+        sqxtn2          v6.8h, v31.4s
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
+        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6
+.endm
+
+.macro load_epel_filterh freg, xreg
+        movrel          \xreg, epel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld1             {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.4h}, [sp], x10
+        ld1             {v17.4h}, [sp], x10
+        ld1             {v18.4h}, [sp], x10
+1:      ld1             {v19.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v16.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v17.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v18.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        sub             x1, x1, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+1:      ld1             {v25.8h, v26.8h, v27.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v19, v22, v25
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+
+        epel_uni_w_hv_end3
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #24
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #24
+        add             x2, x2, #24
+        mov             x17, #24
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #32
+        add             x2, x2, #32
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #48
+        add             x2, x2, #48
+        mov             x17, #16
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+
 #endif
 
 
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b448d755b9..e125b0cfb2 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t 
_dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -291,6 +296,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, 
const int bit_depth)
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, 
qpel_uni_w_hv, _i8mm);
         }
 
-- 
2.38.0.windows.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_hv

Reply via email to