On Mon, 23 Sep 2024, Zhao Zhili wrote:
From: Zhao Zhili <zhiliz...@tencent.com>
dmvr_hv_8_12x20_c: 8.0 ( 1.00x)
dmvr_hv_8_12x20_neon: 1.2 ( 6.62x)
dmvr_hv_8_20x12_c: 8.0 ( 1.00x)
dmvr_hv_8_20x12_neon: 0.9 ( 8.37x)
dmvr_hv_8_20x20_c: 12.9 ( 1.00x)
dmvr_hv_8_20x20_neon: 1.7 ( 7.62x)
dmvr_hv_10_12x20_c: 7.0 ( 1.00x)
dmvr_hv_10_12x20_neon: 1.7 ( 4.09x)
dmvr_hv_10_20x12_c: 7.0 ( 1.00x)
dmvr_hv_10_20x12_neon: 1.7 ( 4.09x)
dmvr_hv_10_20x20_c: 11.2 ( 1.00x)
dmvr_hv_10_20x20_neon: 2.7 ( 4.15x)
dmvr_hv_12_12x20_c: 6.5 ( 1.00x)
dmvr_hv_12_12x20_neon: 1.7 ( 3.79x)
dmvr_hv_12_20x12_c: 6.5 ( 1.00x)
dmvr_hv_12_20x12_neon: 1.7 ( 3.79x)
dmvr_hv_12_20x20_c: 10.2 ( 1.00x)
dmvr_hv_12_20x20_neon: 2.2 ( 4.64x)
---
libavcodec/aarch64/vvc/dsp_init.c | 12 ++
libavcodec/aarch64/vvc/inter.S | 307 ++++++++++++++++++++++++++++++
2 files changed, 319 insertions(+)
diff --git a/libavcodec/aarch64/vvc/dsp_init.c
b/libavcodec/aarch64/vvc/dsp_init.c
index b39ebb83fc..995e26d163 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -83,6 +83,15 @@ W_AVG_FUN(8)
W_AVG_FUN(10)
W_AVG_FUN(12)
+#define DMVR_FUN(fn, bd) \
+ void ff_vvc_dmvr_ ## fn ## bd ## _neon(int16_t *dst, \
+ const uint8_t *_src, const ptrdiff_t _src_stride, const int height, \
+ const intptr_t mx, const intptr_t my, const int width);
Unnecessary const on scalar parameters
+
+DMVR_FUN(hv_, 8)
+DMVR_FUN(hv_, 10)
+DMVR_FUN(hv_, 12)
+
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
{
int cpu_flags = av_get_cpu_flags();
@@ -155,6 +164,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const
int bd)
c->inter.avg = ff_vvc_avg_8_neon;
c->inter.w_avg = vvc_w_avg_8;
+ c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -196,12 +206,14 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c,
const int bd)
} else if (bd == 10) {
c->inter.avg = ff_vvc_avg_10_neon;
c->inter.w_avg = vvc_w_avg_10;
+ c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
} else if (bd == 12) {
c->inter.avg = ff_vvc_avg_12_neon;
c->inter.w_avg = vvc_w_avg_12;
+ c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index c4c6ab1a72..a0bb356f07 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -226,3 +226,310 @@ vvc_avg avg, 12
vvc_avg w_avg, 8
vvc_avg w_avg, 10
vvc_avg w_avg, 12
+
+/* x0: int16_t *dst
+ * x1: const uint8_t *_src
+ * x2: const ptrdiff_t _src_stride
+ * w3: const int height
+ * x4: const intptr_t mx
+ * x5: const intptr_t my
+ * w6: const int width
Unnecessary const
+ */
+function ff_vvc_dmvr_hv_8_neon, export=1
+ dst .req x0
+ src .req x1
+ src_stride .req x2
+ height .req w3
+ mx .req x4
+ my .req x5
+ width .req w6
+ tmp0 .req x7
+ tmp1 .req x8
+
+ sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
+
+ movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
+ add x12, x9, mx, lsl #1
+ ldrb w10, [x12]
+ ldrb w11, [x12, #1]
+ mov tmp0, sp
+ add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
+ // We know the value are positive
+ dup v0.8h, w10 // filter_x[0]
+ dup v1.8h, w11 // filter_x[1]
If we don't need these values in GPRs, we could also just do ld1r,
although that requires incrementing the pointer (which probably can be
done with a post-increment, [x12], #1) between the loads. Then again, I
see you load 8 bits but you want them in 16 bit elements, so that would
require a separate uxtl. So then I guess this use of GPRs for loading is
reasonable.
All in all, the patch seems fine, except for the unnecessary consts.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".