Hi, Martin,
I have updated the patch according to your comments. Please review it again.
Here are the checkasm benchmark results for the affected functions.
They were measured on an Alibaba Cloud g8y instance, whose CPU is
based on Armv9.
put_hevc_pel_uni_pixels4_8_c: 35.9
put_hevc_pel_uni_pixels4_8_neon: 7.6
put_hevc_pel_uni_pixels6_8_c: 46.1
put_hevc_pel_uni_pixels6_8_neon: 20.6
put_hevc_pel_uni_pixels8_8_c: 53.4
put_hevc_pel_uni_pixels8_8_neon: 11.6
put_hevc_pel_uni_pixels12_8_c: 89.1
put_hevc_pel_uni_pixels12_8_neon: 25.9
put_hevc_pel_uni_pixels16_8_c: 106.4
put_hevc_pel_uni_pixels16_8_neon: 20.4
put_hevc_pel_uni_pixels24_8_c: 137.6
put_hevc_pel_uni_pixels24_8_neon: 47.1
put_hevc_pel_uni_pixels32_8_c: 173.6
put_hevc_pel_uni_pixels32_8_neon: 54.1
put_hevc_pel_uni_pixels48_8_c: 268.1
put_hevc_pel_uni_pixels48_8_neon: 117.1
put_hevc_pel_uni_pixels64_8_c: 346.1
put_hevc_pel_uni_pixels64_8_neon: 205.9
在 2023/6/12 15:47, Martin Storsjö 写道:
On Sun, 4 Jun 2023, logan....@myais.com.cn wrote:
From: Logan Lyu <logan....@myais.com.cn>
Signed-off-by: Logan Lyu <logan....@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 ++
libavcodec/aarch64/hevcdsp_qpel_neon.S | 104 ++++++++++++++++++++++
2 files changed, 109 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 483a9d5253..5a1d520eec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t
*_dst, ptrdiff_t _dststride, co
void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
@@ -263,6 +266,8 @@ av_cold void
ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[8][0][1] =
c->put_hevc_qpel_bi[9][0][1] =
ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+ NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0,
qpel_uni_w_v,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S
b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index ed659cfe9b..6ca05b7201 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -490,6 +490,110 @@ put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi
+function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
+1:
+ ldr s0, [x2]
+ ldr s1, [x2, x3]
+ add x2, x2, x3, lsl #1
+ str s0, [x0]
+ str s1, [x0, x1]
+ add x0, x0, x1, lsl #1
+ subs w4, w4, #2
+ b.hi 1b
+ ret
+endfunc
In a loop like this, I would recommend moving the "subs" instruction
further away from the branch that depends on it. For cores with
in-order execution, it does matter a fair bit, while it probably
doesn't for cores with out-of-order execution. Here, the ideal
location probably is after the two loads at the start. The same thing
goes for all the other functions in this patch.
Other than that, this looks ok.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
From a654b41fd8b100f631db49bd419ef65594ef32b3 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sun, 7 May 2023 16:58:30 +0800
Subject: [PATCH 1/5] lavc/aarch64: new optimization for 8-bit
hevc_pel_uni_pixels
Signed-off-by: Logan Lyu <logan....@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 ++
libavcodec/aarch64/hevcdsp_qpel_neon.S | 104 ++++++++++++++++++++++
2 files changed, 109 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 483a9d5253..5a1d520eec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst,
ptrdiff_t _dststride, co
void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
@@ -263,6 +266,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c,
const int bit_depth)
c->put_hevc_qpel_bi[8][0][1] =
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+ NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S
b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index ed659cfe9b..ed5b5027db 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -490,6 +490,110 @@ put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi
+function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1 // copy 4-byte-wide rows: x0=_dst, x1=_dststride, x2=_src, x3=_srcstride, w4=height
+1: // each pass copies two rows; NOTE(review): an odd height would copy one extra row - confirm callers pass even heights
+ ldr s0, [x2] // s0 = 4 bytes of the current source row
+ ldr s1, [x2, x3] // s1 = 4 bytes of the next source row (src + srcstride)
+ subs w4, w4, #2 // two rows consumed; decrement placed early, away from the dependent b.hi
+ add x2, x2, x3, lsl #1 // src += 2 * srcstride (add does not touch the flags)
+ str s0, [x0] // write first row
+ str s1, [x0, x1] // write second row at dst + dststride
+ add x0, x0, x1, lsl #1 // dst += 2 * dststride
+ b.hi 1b // repeat while the count was above 2 before the subs (no borrow, result non-zero)
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1 // copy 6-byte-wide rows: x0=_dst, x1=_dststride, x2=_src, x3=_srcstride, w4=height
+ sub x1, x1, #4 // x1 = dststride - 4, because the first store of each row already advances dst by 4
+1: // each pass copies two rows; NOTE(review): an odd height would copy one extra row - confirm callers pass even heights
+ ldr d0, [x2] // loads 8 bytes of the current row although only 6 are stored (reads 2 bytes past the row)
+ ldr d1, [x2, x3] // same 8-byte load for the next row (src + srcstride)
+ subs w4, w4, #2 // two rows consumed; decrement placed early, away from the dependent b.hi
+ add x2, x2, x3, lsl #1 // src += 2 * srcstride (add does not touch the flags)
+ str s0, [x0], #4 // store bytes 0-3 of row 0, then dst += 4
+ st1 {v0.h}[2], [x0], x1 // store bytes 4-5 (halfword lane 2), then dst += dststride - 4 => start of next row
+ str s1, [x0], #4 // store bytes 0-3 of row 1, then dst += 4
+ st1 {v1.h}[2], [x0], x1 // store bytes 4-5 of row 1, then step to the following row
+ b.hi 1b // repeat while the count was above 2 before the subs (no borrow, result non-zero)
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1 // copy 8-byte-wide rows: x0=_dst, x1=_dststride, x2=_src, x3=_srcstride, w4=height
+1: // each pass copies two rows; NOTE(review): an odd height would copy one extra row - confirm callers pass even heights
+ ldr d0, [x2] // d0 = 8 bytes of the current source row
+ ldr d1, [x2, x3] // d1 = 8 bytes of the next source row (src + srcstride)
+ subs w4, w4, #2 // two rows consumed; decrement placed early, away from the dependent b.hi
+ add x2, x2, x3, lsl #1 // src += 2 * srcstride (add does not touch the flags)
+ str d0, [x0] // write first row
+ str d1, [x0, x1] // write second row at dst + dststride
+ add x0, x0, x1, lsl #1 // dst += 2 * dststride
+ b.hi 1b // repeat while the count was above 2 before the subs (no borrow, result non-zero)
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1 // copy 12-byte-wide rows: x0=_dst, x1=_dststride, x2=_src, x3=_srcstride, w4=height
+ sub x1, x1, #8 // x1 = dststride - 8, because the first store of each row already advances dst by 8
+1: // each pass copies two rows; NOTE(review): an odd height would copy one extra row - confirm callers pass even heights
+ ldr q0, [x2] // loads 16 bytes of the current row although only 12 are stored (reads 4 bytes past the row)
+ ldr q1, [x2, x3] // same 16-byte load for the next row (src + srcstride)
+ subs w4, w4, #2 // two rows consumed; decrement placed early, away from the dependent b.hi
+ add x2, x2, x3, lsl #1 // src += 2 * srcstride (add does not touch the flags)
+ str d0, [x0], #8 // store bytes 0-7 of row 0, then dst += 8
+ st1 {v0.s}[2], [x0], x1 // store bytes 8-11 (word lane 2), then dst += dststride - 8 => start of next row
+ str d1, [x0], #8 // store bytes 0-7 of row 1, then dst += 8
+ st1 {v1.s}[2], [x0], x1 // store bytes 8-11 of row 1, then step to the following row
+ b.hi 1b // repeat while the count was above 2 before the subs (no borrow, result non-zero)
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1 // copy 16-byte-wide rows: x0=_dst, x1=_dststride, x2=_src, x3=_srcstride, w4=height
+1: // each pass copies two rows; NOTE(review): an odd height would copy one extra row - confirm callers pass even heights
+ ldr q0, [x2] // q0 = 16 bytes of the current source row
+ ldr q1, [x2, x3] // q1 = 16 bytes of the next source row (src + srcstride)
+ subs w4, w4, #2 // two rows consumed; decrement placed early, away from the dependent b.hi
+ add x2, x2, x3, lsl #1 // src += 2 * srcstride (add does not touch the flags)
+ str q0, [x0] // write first row
+ str q1, [x0, x1] // write second row at dst + dststride
+ add x0, x0, x1, lsl #1 // dst += 2 * dststride
+ b.hi 1b // repeat while the count was above 2 before the subs (no borrow, result non-zero)
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1 // copy 24-byte-wide rows: x0=_dst, x1=_dststride, x2=_src, x3=_srcstride, w4=height
+1: // each pass copies one row
+ ld1 {v0.8b, v1.8b, v2.8b}, [x2], x3 // load 3 x 8 = 24 bytes, then src += srcstride
+ subs w4, w4, #1 // one row consumed; sits between load and store, away from the dependent b.hi
+ st1 {v0.8b, v1.8b, v2.8b}, [x0], x1 // store 24 bytes, then dst += dststride
+ b.hi 1b // repeat while the count was above 1 before the subs (no borrow, result non-zero)
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1 // copy 32-byte-wide rows: x0=_dst, x1=_dststride, x2=_src, x3=_srcstride, w4=height
+1: // each pass copies one row
+ ld1 {v0.16b, v1.16b}, [x2], x3 // load 2 x 16 = 32 bytes, then src += srcstride
+ subs w4, w4, #1 // one row consumed; sits between load and store, away from the dependent b.hi
+ st1 {v0.16b, v1.16b}, [x0], x1 // store 32 bytes, then dst += dststride
+ b.hi 1b // repeat while the count was above 1 before the subs (no borrow, result non-zero)
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1 // copy 48-byte-wide rows: x0=_dst, x1=_dststride, x2=_src, x3=_srcstride, w4=height
+1: // each pass copies one row
+ ld1 {v0.16b, v1.16b, v2.16b}, [x2], x3 // load 3 x 16 = 48 bytes, then src += srcstride
+ subs w4, w4, #1 // one row consumed; sits between load and store, away from the dependent b.hi
+ st1 {v0.16b, v1.16b, v2.16b}, [x0], x1 // store 48 bytes, then dst += dststride
+ b.hi 1b // repeat while the count was above 1 before the subs (no borrow, result non-zero)
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1 // copy 64-byte-wide rows: x0=_dst, x1=_dststride, x2=_src, x3=_srcstride, w4=height
+1: // each pass copies one row
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 // load 4 x 16 = 64 bytes, then src += srcstride
+ subs w4, w4, #1 // one row consumed; sits between load and store, away from the dependent b.hi
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 // store 64 bytes, then dst += dststride
+ b.hi 1b // repeat while the count was above 1 before the subs (no borrow, result non-zero)
+ ret
+endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
mov w10, #-6
--
2.38.0.windows.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".