Hi Martin,
I've modified the patch according to your comments; please review it again.
Here are the checkasm benchmark results for the related functions:
put_hevc_epel_uni_w_hv4_8_c: 254.6
put_hevc_epel_uni_w_hv4_8_i8mm: 102.9
put_hevc_epel_uni_w_hv6_8_c: 411.6
put_hevc_epel_uni_w_hv6_8_i8mm: 221.6
put_hevc_epel_uni_w_hv8_8_c: 669.4
put_hevc_epel_uni_w_hv8_8_i8mm: 214.9
put_hevc_epel_uni_w_hv12_8_c: 1412.6
put_hevc_epel_uni_w_hv12_8_i8mm: 481.4
put_hevc_epel_uni_w_hv16_8_c: 2425.4
put_hevc_epel_uni_w_hv16_8_i8mm: 647.4
put_hevc_epel_uni_w_hv24_8_c: 5384.1
put_hevc_epel_uni_w_hv24_8_i8mm: 1450.6
put_hevc_epel_uni_w_hv32_8_c: 9470.9
put_hevc_epel_uni_w_hv32_8_i8mm: 2497.1
put_hevc_epel_uni_w_hv48_8_c: 20930.1
put_hevc_epel_uni_w_hv48_8_i8mm: 5635.9
put_hevc_epel_uni_w_hv64_8_c: 36682.9
put_hevc_epel_uni_w_hv64_8_i8mm: 9712.6
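
For reference, the per-pixel operation these functions perform after the
horizontal + vertical 4-tap filter is roughly the following (a C sketch for
clarity only, not the actual FFmpeg code, and the helper name is mine):

    /* "filtered" is the 16-bit result of the h+v filter, already shifted
     * right by 6 (the sqshrn #6 in calc_epelh). */
    static inline unsigned char uni_w_pixel(int filtered, int denom,
                                            int wx, int ox)
    {
        int shift  = denom + 6;          /* denom + 14 - bit_depth for 8-bit */
        int offset = 1 << (shift - 1);   /* rounding term, v15 in the asm    */
        int val    = ((filtered * wx + offset) >> shift) + ox;
        if (val < 0)   val = 0;          /* sqxtn/sqxtun saturation          */
        if (val > 255) val = 255;
        return (unsigned char)val;
    }
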
On 2023/6/12 16:19, Martin Storsjö wrote:
On Sun, 4 Jun 2023, logan....@myais.com.cn wrote:
From: Logan Lyu <logan....@myais.com.cn>
Signed-off-by: Logan Lyu <logan....@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 703 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 7 +
2 files changed, 710 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 32f052a7b1..24a74d2c7d 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -718,6 +718,709 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
ret
endfunc
+.macro epel_uni_w_hv_start
+ mov x15, x5 //denom
+ mov x16, x6 //wx
+ mov x17, x7 //ox
+ add w15, w15, #6 //shift = denom+6
+
+
+ ldp x5, x6, [sp]
+ ldp x7, xzr, [sp, #16]
Why ldp into xzr? That seems pointless.
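You're right, it was pointless; in the attached v2 this is now a plain load:

    ldr x7, [sp, #16]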
+
+ sub sp, sp, #128
+ stp q12, q13, [sp]
This could be "stp q12, q13, [sp, #-128]!"
+ stp q14, q15, [sp, #32]
+ stp q8, q9, [sp, #64]
+ stp q10, q11, [sp, #96]
+
+ dup v13.8h, w16 //wx
+ dup v14.4s, w17 //ox
+
+ mov w17, #1
+ lsl w17, w17, w15
+ lsr w17, w17, #1
+ dup v15.4s, w17
+
+ neg w15, w15 // -shift
+ dup v12.4s, w15 //shift
+.endm
+
+.macro epel_uni_w_hv_end
+ smull v28.4s, v4.4h, v13.4h
+ smull2 v29.4s, v4.8h, v13.8h
+ add v28.4s, v28.4s, v15.4s
+ add v29.4s, v29.4s, v15.4s
+ sshl v28.4s, v28.4s, v12.4s
+ sshl v29.4s, v29.4s, v12.4s
+ add v28.4s, v28.4s, v14.4s
+ add v29.4s, v29.4s, v14.4s
+ sqxtn v4.4h, v28.4s
+ sqxtn2 v4.8h, v29.4s
+.endm
+
+.macro epel_uni_w_hv_end2
+ smull v28.4s, v4.4h, v13.4h
+ smull2 v29.4s, v4.8h, v13.8h
+ smull v30.4s, v5.4h, v13.4h
+ smull2 v31.4s, v5.8h, v13.8h
+ add v28.4s, v28.4s, v15.4s
+ add v29.4s, v29.4s, v15.4s
+ add v30.4s, v30.4s, v15.4s
+ add v31.4s, v31.4s, v15.4s
+
+ sshl v28.4s, v28.4s, v12.4s
+ sshl v29.4s, v29.4s, v12.4s
+ sshl v30.4s, v30.4s, v12.4s
+ sshl v31.4s, v31.4s, v12.4s
+
+ add v28.4s, v28.4s, v14.4s
+ add v29.4s, v29.4s, v14.4s
+ add v30.4s, v30.4s, v14.4s
+ add v31.4s, v31.4s, v14.4s
+
+ sqxtn v4.4h, v28.4s
+ sqxtn2 v4.8h, v29.4s
+ sqxtn v5.4h, v30.4s
+ sqxtn2 v5.8h, v31.4s
+.endm
+
+.macro epel_uni_w_hv_end3
+ smull v1.4s, v4.4h, v13.4h
+ smull2 v2.4s, v4.8h, v13.8h
+ smull v28.4s, v5.4h, v13.4h
+ smull2 v29.4s, v5.8h, v13.8h
+ smull v30.4s, v6.4h, v13.4h
+ smull2 v31.4s, v6.8h, v13.8h
+ add v1.4s, v1.4s, v15.4s
+ add v2.4s, v2.4s, v15.4s
+ add v28.4s, v28.4s, v15.4s
+ add v29.4s, v29.4s, v15.4s
+ add v30.4s, v30.4s, v15.4s
+ add v31.4s, v31.4s, v15.4s
+
+ sshl v1.4s, v1.4s, v12.4s
+ sshl v2.4s, v2.4s, v12.4s
+ sshl v28.4s, v28.4s, v12.4s
+ sshl v29.4s, v29.4s, v12.4s
+ sshl v30.4s, v30.4s, v12.4s
+ sshl v31.4s, v31.4s, v12.4s
+ add v1.4s, v1.4s, v14.4s
+ add v2.4s, v2.4s, v14.4s
+ add v28.4s, v28.4s, v14.4s
+ add v29.4s, v29.4s, v14.4s
+ add v30.4s, v30.4s, v14.4s
+ add v31.4s, v31.4s, v14.4s
+
+ sqxtn v4.4h, v1.4s
+ sqxtn2 v4.8h, v2.4s
+ sqxtn v5.4h, v28.4s
+ sqxtn2 v5.8h, v29.4s
+ sqxtn v6.4h, v30.4s
+ sqxtn2 v6.8h, v31.4s
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+ smull \dst\().4s, \src0\().4h, v0.h[0]
+ smlal \dst\().4s, \src1\().4h, v0.h[1]
+ smlal \dst\().4s, \src2\().4h, v0.h[2]
+ smlal \dst\().4s, \src3\().4h, v0.h[3]
+ sqshrn \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+ smull2 \tmp\().4s, \src0\().8h, v0.h[0]
+ smlal2 \tmp\().4s, \src1\().8h, v0.h[1]
+ smlal2 \tmp\().4s, \src2\().8h, v0.h[2]
+ smlal2 \tmp\().4s, \src3\().8h, v0.h[3]
+ sqshrn2 \dst\().8h, \tmp\().4s, #6
+.endm
+
+.macro load_epel_filterh freg, xreg
+ movrel \xreg, epel_filters
+ add \xreg, \xreg, \freg, lsl #2
+ ld1 {v0.8b}, [\xreg]
+ sxtl v0.8h, v0.8b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+ epel_uni_w_hv_start
+ and x4, x4, 0xffffffff
What does this "and" do here? Is it a case where the argument is
"int", while the upper bits of the register is undefined? In those
cases, you're best off by just using "w4", possibly "w4, uxtw" (or
sxtw) instead of manually doing such an "and" here.
+
+ add x10, x4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]!
+ stp x4, x6, [sp, #-16]!
+ stp xzr, x30, [sp, #-16]!
Don't do consecutive decrements like this, but do one "stp ..., [sp,
#-48]!" followed by "stp ..., [sp, #16]" etc.
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add x3, x4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+ ldp xzr, x30, [sp], #16
+ ldp x4, x6, [sp], #16
+ ldp x0, x1, [sp], #16
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.4h}, [sp], x10
+ ld1 {v17.4h}, [sp], x10
+ ld1 {v18.4h}, [sp], x10
+1: ld1 {v19.4h}, [sp], x10
+ calc_epelh v4, v16, v17, v18, v19
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ str s4, [x0]
+ add x0, x0, x1
+ subs x4, x4, #1
+ b.eq 2f
+
+ ld1 {v16.4h}, [sp], x10
+ calc_epelh v4, v17, v18, v19, v16
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ str s4, [x0]
+ add x0, x0, x1
+ subs x4, x4, #1
+ b.eq 2f
+
+ ld1 {v17.4h}, [sp], x10
+ calc_epelh v4, v18, v19, v16, v17
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ str s4, [x0]
+ add x0, x0, x1
+ subs x4, x4, #1
+ b.eq 2f
+
+ ld1 {v18.4h}, [sp], x10
+ calc_epelh v4, v19, v16, v17, v18
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ str s4, [x0]
+ add x0, x0, x1
+ subs x4, x4, #1
+ b.ne 1b
+2:
+ ldp q12, q13, [sp]
+ ldp q14, q15, [sp, #32]
+ ldp q8, q9, [sp, #64]
+ ldp q10, q11, [sp, #96]
+ add sp, sp, #128
Fold the stack increment into ldp, like "ldp q12, q13, [sp], #128".
The same thing applies to all other functions in this patch too.
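Done in v2: the epilogue restores the higher offsets first and folds the
increment into the final ldp, e.g.

    ldp q14, q15, [sp, #32]
    ldp q8, q9, [sp, #64]
    ldp q10, q11, [sp, #96]
    ldp q12, q13, [sp], #128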
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 348497bbbe..fbbc4e6071 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
int height, int denom, int wx, int ox,
intptr_t mx, intptr_t my, int width), _i8mm);
+NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@@ -286,11 +291,13 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
if (have_i8mm(cpu_flags)) {
Stray whitespace change.
// Martin
From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sun, 28 May 2023 10:35:43 +0800
Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_hv
Signed-off-by: Logan Lyu <logan....@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 694 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 +
2 files changed, 700 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 8b6f396a0b..355679af29 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -717,6 +717,700 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
ret
endfunc
+.macro epel_uni_w_hv_start
+ mov x15, x5 //denom
+ mov x16, x6 //wx
+ mov x17, x7 //ox
+ add w15, w15, #6 //shift = denom+6
+
+
+ ldp x5, x6, [sp]
+ ldr x7, [sp, #16]
+
+ stp q12, q13, [sp, #-128]!
+ stp q14, q15, [sp, #32]
+ stp q8, q9, [sp, #64]
+ stp q10, q11, [sp, #96]
+
+ dup v13.8h, w16 //wx
+ dup v14.4s, w17 //ox
+
+ mov w17, #1
+ lsl w17, w17, w15
+ lsr w17, w17, #1
+ dup v15.4s, w17
+
+ neg w15, w15 // -shift
+ dup v12.4s, w15 //shift
+.endm
+
+.macro epel_uni_w_hv_end
+ smull v28.4s, v4.4h, v13.4h
+ smull2 v29.4s, v4.8h, v13.8h
+ add v28.4s, v28.4s, v15.4s
+ add v29.4s, v29.4s, v15.4s
+ sshl v28.4s, v28.4s, v12.4s
+ sshl v29.4s, v29.4s, v12.4s
+ add v28.4s, v28.4s, v14.4s
+ add v29.4s, v29.4s, v14.4s
+ sqxtn v4.4h, v28.4s
+ sqxtn2 v4.8h, v29.4s
+.endm
+
+.macro epel_uni_w_hv_end2
+ smull v28.4s, v4.4h, v13.4h
+ smull2 v29.4s, v4.8h, v13.8h
+ smull v30.4s, v5.4h, v13.4h
+ smull2 v31.4s, v5.8h, v13.8h
+ add v28.4s, v28.4s, v15.4s
+ add v29.4s, v29.4s, v15.4s
+ add v30.4s, v30.4s, v15.4s
+ add v31.4s, v31.4s, v15.4s
+
+ sshl v28.4s, v28.4s, v12.4s
+ sshl v29.4s, v29.4s, v12.4s
+ sshl v30.4s, v30.4s, v12.4s
+ sshl v31.4s, v31.4s, v12.4s
+
+ add v28.4s, v28.4s, v14.4s
+ add v29.4s, v29.4s, v14.4s
+ add v30.4s, v30.4s, v14.4s
+ add v31.4s, v31.4s, v14.4s
+
+ sqxtn v4.4h, v28.4s
+ sqxtn2 v4.8h, v29.4s
+ sqxtn v5.4h, v30.4s
+ sqxtn2 v5.8h, v31.4s
+.endm
+
+.macro epel_uni_w_hv_end3
+ smull v1.4s, v4.4h, v13.4h
+ smull2 v2.4s, v4.8h, v13.8h
+ smull v28.4s, v5.4h, v13.4h
+ smull2 v29.4s, v5.8h, v13.8h
+ smull v30.4s, v6.4h, v13.4h
+ smull2 v31.4s, v6.8h, v13.8h
+ add v1.4s, v1.4s, v15.4s
+ add v2.4s, v2.4s, v15.4s
+ add v28.4s, v28.4s, v15.4s
+ add v29.4s, v29.4s, v15.4s
+ add v30.4s, v30.4s, v15.4s
+ add v31.4s, v31.4s, v15.4s
+
+ sshl v1.4s, v1.4s, v12.4s
+ sshl v2.4s, v2.4s, v12.4s
+ sshl v28.4s, v28.4s, v12.4s
+ sshl v29.4s, v29.4s, v12.4s
+ sshl v30.4s, v30.4s, v12.4s
+ sshl v31.4s, v31.4s, v12.4s
+ add v1.4s, v1.4s, v14.4s
+ add v2.4s, v2.4s, v14.4s
+ add v28.4s, v28.4s, v14.4s
+ add v29.4s, v29.4s, v14.4s
+ add v30.4s, v30.4s, v14.4s
+ add v31.4s, v31.4s, v14.4s
+
+ sqxtn v4.4h, v1.4s
+ sqxtn2 v4.8h, v2.4s
+ sqxtn v5.4h, v28.4s
+ sqxtn2 v5.8h, v29.4s
+ sqxtn v6.4h, v30.4s
+ sqxtn2 v6.8h, v31.4s
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+ smull \dst\().4s, \src0\().4h, v0.h[0]
+ smlal \dst\().4s, \src1\().4h, v0.h[1]
+ smlal \dst\().4s, \src2\().4h, v0.h[2]
+ smlal \dst\().4s, \src3\().4h, v0.h[3]
+ sqshrn \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+ smull2 \tmp\().4s, \src0\().8h, v0.h[0]
+ smlal2 \tmp\().4s, \src1\().8h, v0.h[1]
+ smlal2 \tmp\().4s, \src2\().8h, v0.h[2]
+ smlal2 \tmp\().4s, \src3\().8h, v0.h[3]
+ sqshrn2 \dst\().8h, \tmp\().4s, #6
+.endm
+
+.macro load_epel_filterh freg, xreg
+ movrel \xreg, epel_filters
+ add \xreg, \xreg, \freg, lsl #2
+ ld1 {v0.8b}, [\xreg]
+ sxtl v0.8h, v0.8b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+ epel_uni_w_hv_start
+ sxtw x4, w4
+
+ add x10, x4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp xzr, x30, [sp, #-48]!
+ stp x4, x6, [sp, #16]
+ stp x0, x1, [sp, #32]
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add x3, x4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ ldp xzr, x30, [sp], #48
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.4h}, [sp], x10
+ ld1 {v17.4h}, [sp], x10
+ ld1 {v18.4h}, [sp], x10
+1: ld1 {v19.4h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v16, v17, v18, v19
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ str s4, [x0]
+ add x0, x0, x1
+ b.eq 2f
+
+ ld1 {v16.4h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v17, v18, v19, v16
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ str s4, [x0]
+ add x0, x0, x1
+ b.eq 2f
+
+ ld1 {v17.4h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v18, v19, v16, v17
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ str s4, [x0]
+ add x0, x0, x1
+ b.eq 2f
+
+ ld1 {v18.4h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v19, v16, v17, v18
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ str s4, [x0]
+ add x0, x0, x1
+ b.ne 1b
+2:
+ ldp q14, q15, [sp, #32]
+ ldp q8, q9, [sp, #64]
+ ldp q10, q11, [sp, #96]
+ ldp q12, q13, [sp], #128
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1
+ epel_uni_w_hv_start
+ sxtw x4, w4
+
+ add x10, x4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp xzr, x30, [sp, #-48]!
+ stp x4, x6, [sp, #16]
+ stp x0, x1, [sp, #32]
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add x3, x4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ ldp xzr, x30, [sp], #48
+ load_epel_filterh x6, x5
+ sub x1, x1, #4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+1: ld1 {v19.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v16, v17, v18, v19
+ calc_epelh2 v4, v5, v16, v17, v18, v19
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ st1 {v4.s}[0], [x0], #4
+ st1 {v4.h}[2], [x0], x1
+ b.eq 2f
+
+ ld1 {v16.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v17, v18, v19, v16
+ calc_epelh2 v4, v5, v17, v18, v19, v16
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ st1 {v4.s}[0], [x0], #4
+ st1 {v4.h}[2], [x0], x1
+ b.eq 2f
+
+ ld1 {v17.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v18, v19, v16, v17
+ calc_epelh2 v4, v5, v18, v19, v16, v17
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ st1 {v4.s}[0], [x0], #4
+ st1 {v4.h}[2], [x0], x1
+ b.eq 2f
+
+ ld1 {v18.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v19, v16, v17, v18
+ calc_epelh2 v4, v5, v19, v16, v17, v18
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ st1 {v4.s}[0], [x0], #4
+ st1 {v4.h}[2], [x0], x1
+ b.ne 1b
+2:
+ ldp q14, q15, [sp, #32]
+ ldp q8, q9, [sp, #64]
+ ldp q10, q11, [sp, #96]
+ ldp q12, q13, [sp], #128
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1
+ epel_uni_w_hv_start
+ sxtw x4, w4
+
+ add x10, x4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp xzr, x30, [sp, #-48]!
+ stp x4, x6, [sp, #16]
+ stp x0, x1, [sp, #32]
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add x3, x4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ ldp xzr, x30, [sp], #48
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+1: ld1 {v19.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v16, v17, v18, v19
+ calc_epelh2 v4, v5, v16, v17, v18, v19
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ st1 {v4.8b}, [x0], x1
+ b.eq 2f
+
+ ld1 {v16.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v17, v18, v19, v16
+ calc_epelh2 v4, v5, v17, v18, v19, v16
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ st1 {v4.8b}, [x0], x1
+ b.eq 2f
+
+ ld1 {v17.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v18, v19, v16, v17
+ calc_epelh2 v4, v5, v18, v19, v16, v17
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ st1 {v4.8b}, [x0], x1
+ b.eq 2f
+
+ ld1 {v18.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v19, v16, v17, v18
+ calc_epelh2 v4, v5, v19, v16, v17, v18
+ epel_uni_w_hv_end
+ sqxtun v4.8b, v4.8h
+ st1 {v4.8b}, [x0], x1
+ b.ne 1b
+2:
+ ldp q14, q15, [sp, #32]
+ ldp q8, q9, [sp, #64]
+ ldp q10, q11, [sp, #96]
+ ldp q12, q13, [sp], #128
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1
+ epel_uni_w_hv_start
+ sxtw x4, w4
+
+ add x10, x4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp xzr, x30, [sp, #-48]!
+ stp x4, x6, [sp, #16]
+ stp x0, x1, [sp, #32]
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add x3, x4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ ldp xzr, x30, [sp], #48
+ load_epel_filterh x6, x5
+ sub x1, x1, #8
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+1: ld1 {v22.8h, v23.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v16, v18, v20, v22
+ calc_epelh2 v4, v5, v16, v18, v20, v22
+ calc_epelh v5, v17, v19, v21, v23
+ epel_uni_w_hv_end2
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v5.8h
+ st1 {v4.8b}, [x0], #8
+ st1 {v4.s}[2], [x0], x1
+ b.eq 2f
+
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v18, v20, v22, v16
+ calc_epelh2 v4, v5, v18, v20, v22, v16
+ calc_epelh v5, v19, v21, v23, v17
+ epel_uni_w_hv_end2
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v5.8h
+ st1 {v4.8b}, [x0], #8
+ st1 {v4.s}[2], [x0], x1
+ b.eq 2f
+
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v20, v22, v16, v18
+ calc_epelh2 v4, v5, v20, v22, v16, v18
+ calc_epelh v5, v21, v23, v17, v19
+ epel_uni_w_hv_end2
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v5.8h
+ st1 {v4.8b}, [x0], #8
+ st1 {v4.s}[2], [x0], x1
+ b.eq 2f
+
+ ld1 {v20.8h, v21.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v22, v16, v18, v20
+ calc_epelh2 v4, v5, v22, v16, v18, v20
+ calc_epelh v5, v23, v17, v19, v21
+ epel_uni_w_hv_end2
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v5.8h
+ st1 {v4.8b}, [x0], #8
+ st1 {v4.s}[2], [x0], x1
+ b.ne 1b
+2:
+ ldp q14, q15, [sp, #32]
+ ldp q8, q9, [sp, #64]
+ ldp q10, q11, [sp, #96]
+ ldp q12, q13, [sp], #128
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1
+ epel_uni_w_hv_start
+ sxtw x4, w4
+
+ add x10, x4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp xzr, x30, [sp, #-48]!
+ stp x4, x6, [sp, #16]
+ stp x0, x1, [sp, #32]
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add x3, x4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ ldp xzr, x30, [sp], #48
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+1: ld1 {v22.8h, v23.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v16, v18, v20, v22
+ calc_epelh2 v4, v5, v16, v18, v20, v22
+ calc_epelh v5, v17, v19, v21, v23
+ calc_epelh2 v5, v6, v17, v19, v21, v23
+ epel_uni_w_hv_end2
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v5.8h
+ st1 {v4.16b}, [x0], x1
+ b.eq 2f
+
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v18, v20, v22, v16
+ calc_epelh2 v4, v5, v18, v20, v22, v16
+ calc_epelh v5, v19, v21, v23, v17
+ calc_epelh2 v5, v6, v19, v21, v23, v17
+ epel_uni_w_hv_end2
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v5.8h
+ st1 {v4.16b}, [x0], x1
+ b.eq 2f
+
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v20, v22, v16, v18
+ calc_epelh2 v4, v5, v20, v22, v16, v18
+ calc_epelh v5, v21, v23, v17, v19
+ calc_epelh2 v5, v6, v21, v23, v17, v19
+ epel_uni_w_hv_end2
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v5.8h
+ st1 {v4.16b}, [x0], x1
+ b.eq 2f
+
+ ld1 {v20.8h, v21.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v22, v16, v18, v20
+ calc_epelh2 v4, v5, v22, v16, v18, v20
+ calc_epelh v5, v23, v17, v19, v21
+ calc_epelh2 v5, v6, v23, v17, v19, v21
+ epel_uni_w_hv_end2
+ sqxtun v4.8b, v4.8h
+ sqxtun2 v4.16b, v5.8h
+ st1 {v4.16b}, [x0], x1
+ b.ne 1b
+2:
+ ldp q14, q15, [sp, #32]
+ ldp q8, q9, [sp, #64]
+ ldp q10, q11, [sp, #96]
+ ldp q12, q13, [sp], #128
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
+ epel_uni_w_hv_start
+ sxtw x4, w4
+
+ add x10, x4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp xzr, x30, [sp, #-48]!
+ stp x4, x6, [sp, #16]
+ stp x0, x1, [sp, #32]
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add x3, x4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ ldp xzr, x30, [sp], #48
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
+ ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
+ ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
+1: ld1 {v25.8h, v26.8h, v27.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v16, v19, v22, v25
+ calc_epelh2 v4, v5, v16, v19, v22, v25
+ calc_epelh v5, v17, v20, v23, v26
+ calc_epelh2 v5, v6, v17, v20, v23, v26
+ calc_epelh v6, v18, v21, v24, v27
+ calc_epelh2 v6, v7, v18, v21, v24, v27
+
+ epel_uni_w_hv_end3
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ sqxtun v6.8b, v6.8h
+ st1 {v4.8b, v5.8b, v6.8b}, [x0], x1
+ b.eq 2f
+
+ ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v19, v22, v25, v16
+ calc_epelh2 v4, v5, v19, v22, v25, v16
+ calc_epelh v5, v20, v23, v26, v17
+ calc_epelh2 v5, v6, v20, v23, v26, v17
+ calc_epelh v6, v21, v24, v27, v18
+ calc_epelh2 v6, v7, v21, v24, v27, v18
+ epel_uni_w_hv_end3
+
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ sqxtun v6.8b, v6.8h
+ st1 {v4.8b, v5.8b, v6.8b}, [x0], x1
+ b.eq 2f
+
+ ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v22, v25, v16, v19
+ calc_epelh2 v4, v5, v22, v25, v16, v19
+ calc_epelh v5, v23, v26, v17, v20
+ calc_epelh2 v5, v6, v23, v26, v17, v20
+ calc_epelh v6, v24, v27, v18, v21
+ calc_epelh2 v6, v7, v24, v27, v18, v21
+ epel_uni_w_hv_end3
+
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ sqxtun v6.8b, v6.8h
+ st1 {v4.8b, v5.8b, v6.8b}, [x0], x1
+ b.eq 2f
+
+ ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
+ subs x4, x4, #1
+ calc_epelh v4, v25, v16, v19, v22
+ calc_epelh2 v4, v5, v25, v16, v19, v22
+ calc_epelh v5, v26, v17, v20, v23
+ calc_epelh2 v5, v6, v26, v17, v20, v23
+ calc_epelh v6, v27, v18, v21, v24
+ calc_epelh2 v6, v7, v27, v18, v21, v24
+ epel_uni_w_hv_end3
+
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ sqxtun v6.8b, v6.8h
+ st1 {v4.8b, v5.8b, v6.8b}, [x0], x1
+ b.ne 1b
+2:
+ ldp q14, q15, [sp, #32]
+ ldp q8, q9, [sp, #64]
+ ldp q10, q11, [sp, #96]
+ ldp q12, q13, [sp], #128
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
+ ldp x15, x16, [sp]
+ stp x0, x30, [sp, #-16]!
+ stp x1, x2, [sp, #-16]!
+ stp x3, x4, [sp, #-16]!
+ stp x5, x6, [sp, #-16]!
+ mov x17, #16
+ stp x17, x7, [sp, #-16]!
+ stp x15, x16, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+ ldp x15, x16, [sp], #16
+ ldp x17, x7, [sp], #16
+ ldp x5, x6, [sp], #16
+ ldp x3, x4, [sp], #16
+ ldp x1, x2, [sp], #16
+ ldr x0, [sp]
+ add x0, x0, #16
+ add x2, x2, #16
+ mov x17, #16
+ stp x17, xzr, [sp, #-16]!
+ stp x15, x16, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+ ldp x15, x16, [sp], #16
+ ldp x17, xzr, [sp], #16
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1
+ ldp x15, x16, [sp]
+ stp x0, x30, [sp, #-16]!
+ stp x1, x2, [sp, #-16]!
+ stp x3, x4, [sp, #-16]!
+ stp x5, x6, [sp, #-16]!
+ mov x17, #24
+ stp x17, x7, [sp, #-16]!
+ stp x15, x16, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+ ldp x15, x16, [sp], #16
+ ldp x17, x7, [sp], #16
+ ldp x5, x6, [sp], #16
+ ldp x3, x4, [sp], #16
+ ldp x1, x2, [sp], #16
+ ldr x0, [sp]
+ add x0, x0, #24
+ add x2, x2, #24
+ mov x17, #24
+ stp x17, xzr, [sp, #-16]!
+ stp x15, x16, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+ ldp x15, x16, [sp], #16
+ ldp x17, xzr, [sp], #16
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
+ ldp x15, x16, [sp]
+ stp x0, x30, [sp, #-16]!
+ stp x1, x2, [sp, #-16]!
+ stp x3, x4, [sp, #-16]!
+ stp x5, x6, [sp, #-16]!
+ mov x17, #16
+ stp x17, x7, [sp, #-16]!
+ stp x15, x16, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+ ldp x15, x16, [sp], #16
+ ldp x17, x7, [sp], #16
+ ldp x5, x6, [sp]
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp, #32]
+ ldr x0, [sp, #48]
+ add x0, x0, #16
+ add x2, x2, #16
+ mov x17, #16
+ stp x17, x7, [sp, #-16]!
+ stp x15, x16, [sp, #-16]!
+
+ bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+ ldp x15, x16, [sp], #16
+ ldp x17, x7, [sp], #16
+ ldp x5, x6, [sp]
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp, #32]
+ ldr x0, [sp, #48]
+ add x0, x0, #32
+ add x2, x2, #32
+ mov x17, #16
+ stp x17, x7, [sp, #-16]!
+ stp x15, x16, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+ ldp x15, x16, [sp], #16
+ ldp x17, x7, [sp], #16
+ ldp x5, x6, [sp], #16
+ ldp x3, x4, [sp], #16
+ ldp x1, x2, [sp], #16
+ ldr x0, [sp]
+ add x0, x0, #48
+ add x2, x2, #48
+ mov x17, #16
+ stp x17, xzr, [sp, #-16]!
+ stp x15, x16, [sp, #-16]!
+ bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+ ldp x15, x16, [sp], #16
+ ldp x17, xzr, [sp], #16
+ ldp xzr, x30, [sp], #16
+ ret
+endfunc
+
+
#endif
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b448d755b9..e125b0cfb2 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
int height, int denom, int wx, int ox,
intptr_t mx, intptr_t my, int width), _i8mm);
+NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@@ -291,6 +296,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+ NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
}
--
2.38.0.windows.1