On Mon, 23 Sep 2024, Zhao Zhili wrote:

From: Zhao Zhili <zhiliz...@tencent.com>

dmvr_hv_8_12x20_c:                                       8.0 ( 1.00x)
dmvr_hv_8_12x20_neon:                                    1.2 ( 6.62x)
dmvr_hv_8_20x12_c:                                       8.0 ( 1.00x)
dmvr_hv_8_20x12_neon:                                    0.9 ( 8.37x)
dmvr_hv_8_20x20_c:                                      12.9 ( 1.00x)
dmvr_hv_8_20x20_neon:                                    1.7 ( 7.62x)
dmvr_hv_10_12x20_c:                                      7.0 ( 1.00x)
dmvr_hv_10_12x20_neon:                                   1.7 ( 4.09x)
dmvr_hv_10_20x12_c:                                      7.0 ( 1.00x)
dmvr_hv_10_20x12_neon:                                   1.7 ( 4.09x)
dmvr_hv_10_20x20_c:                                     11.2 ( 1.00x)
dmvr_hv_10_20x20_neon:                                   2.7 ( 4.15x)
dmvr_hv_12_12x20_c:                                      6.5 ( 1.00x)
dmvr_hv_12_12x20_neon:                                   1.7 ( 3.79x)
dmvr_hv_12_20x12_c:                                      6.5 ( 1.00x)
dmvr_hv_12_20x12_neon:                                   1.7 ( 3.79x)
dmvr_hv_12_20x20_c:                                     10.2 ( 1.00x)
dmvr_hv_12_20x20_neon:                                   2.2 ( 4.64x)
---
libavcodec/aarch64/vvc/dsp_init.c |  12 ++
libavcodec/aarch64/vvc/inter.S    | 307 ++++++++++++++++++++++++++++++
2 files changed, 319 insertions(+)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index b39ebb83fc..995e26d163 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -83,6 +83,15 @@ W_AVG_FUN(8)
W_AVG_FUN(10)
W_AVG_FUN(12)

+#define DMVR_FUN(fn, bd) \
+    void ff_vvc_dmvr_ ## fn ## bd ## _neon(int16_t *dst, \
+        const uint8_t *_src, const ptrdiff_t _src_stride, const int height, \
+        const intptr_t mx, const intptr_t my, const int width);

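Unnecessary const on the scalar (by-value) parameters: _src_stride, height, mx, my and width. A top-level const on a by-value parameter has no effect on callers, so it only adds noise to the prototype.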

+
+DMVR_FUN(hv_, 8)
+DMVR_FUN(hv_, 10)
+DMVR_FUN(hv_, 12)
+
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
{
    int cpu_flags = av_get_cpu_flags();
@@ -155,6 +164,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)

        c->inter.avg = ff_vvc_avg_8_neon;
        c->inter.w_avg = vvc_w_avg_8;
+        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;

        for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
            c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -196,12 +206,14 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
    } else if (bd == 10) {
        c->inter.avg = ff_vvc_avg_10_neon;
        c->inter.w_avg = vvc_w_avg_10;
+        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;

        c->alf.filter[LUMA] = alf_filter_luma_10_neon;
        c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
    } else if (bd == 12) {
        c->inter.avg = ff_vvc_avg_12_neon;
        c->inter.w_avg = vvc_w_avg_12;
+        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;

        c->alf.filter[LUMA] = alf_filter_luma_12_neon;
        c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index c4c6ab1a72..a0bb356f07 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -226,3 +226,310 @@ vvc_avg avg, 12
vvc_avg w_avg, 8
vvc_avg w_avg, 10
vvc_avg w_avg, 12
+
+/* x0: int16_t *dst
+ * x1: const uint8_t *_src
+ * x2: const ptrdiff_t _src_stride
+ * w3: const int height
+ * x4: const intptr_t mx
+ * x5: const intptr_t my
+ * w6: const int width

Unnecessary const

+ */
+function ff_vvc_dmvr_hv_8_neon, export=1
+        dst             .req x0
+        src             .req x1
+        src_stride      .req x2
+        height          .req w3
+        mx              .req x4
+        my              .req x5
+        width           .req w6
+        tmp0            .req x7
+        tmp1            .req x8
+
+        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)
+
+        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
+        add             x12, x9, mx, lsl #1
+        ldrb            w10, [x12]
+        ldrb            w11, [x12, #1]
+        mov             tmp0, sp
+        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
+        // We know the value are positive
+        dup             v0.8h, w10                  // filter_x[0]
+        dup             v1.8h, w11                  // filter_x[1]

If we don't need these values in GPRs, we could also just use ld1r, although that requires incrementing the pointer between the loads (which can probably be done with a post-increment, [x12], #1). Then again, I see that you load 8-bit values but want them in 16-bit elements, so that would require a separate uxtl. So I guess this use of GPRs for loading is reasonable.

All in all, the patch seems fine, except for the unnecessary consts.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to