The branch, master has been updated
       via  7b21bde34c4331b37a3e35832fb728e26c519bf4 (commit)
       via  189e841cfdb150cb6614e18454fcf1ed59e17604 (commit)
      from  1b97966199f797deee62cd3938feef93098005b2 (commit)


- Log -----------------------------------------------------------------
commit 7b21bde34c4331b37a3e35832fb728e26c519bf4
Author:     Krzysztof Pyrkosz <[email protected]>
AuthorDate: Sat Sep 6 00:49:21 2025 +0200
Commit:     jianhuaw <[email protected]>
CommitDate: Mon Sep 8 17:51:20 2025 +0000

    avcodec/aarch64/vvc: Implemented dmvr_h_10
    
    A78:
    dmvr_h_10_12x20_neon:                                   82.2 ( 6.49x)
    dmvr_h_10_20x12_neon:                                   69.9 ( 3.66x)
    dmvr_h_10_20x20_neon:                                  112.5 ( 3.74x)
    dmvr_h_12_12x20_neon:                                   81.4 ( 6.51x)
    dmvr_h_12_20x12_neon:                                   69.2 ( 3.74x)
    dmvr_h_12_20x20_neon:                                  110.2 ( 3.85x)
    
    A72:
    dmvr_h_10_12x20_neon:                                  234.1 ( 4.67x)
    dmvr_h_10_20x12_neon:                                  221.4 ( 3.48x)
    dmvr_h_10_20x20_neon:                                  356.9 ( 3.59x)
    dmvr_h_12_12x20_neon:                                  234.1 ( 4.67x)
    dmvr_h_12_20x12_neon:                                  221.5 ( 3.53x)
    dmvr_h_12_20x20_neon:                                  357.0 ( 3.64x)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 2e0e7434e8..08204063f9 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -96,6 +96,8 @@ W_AVG_FUN(12)
 DMVR_FUN(, 8)
 DMVR_FUN(, 12)
 DMVR_FUN(h_, 8)
+DMVR_FUN(h_, 10)
+DMVR_FUN(h_, 12)
 DMVR_FUN(hv_, 8)
 DMVR_FUN(hv_, 10)
 DMVR_FUN(hv_, 12)
@@ -234,6 +236,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
         c->inter.w_avg = vvc_w_avg_10;
+        c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
         c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
 
@@ -243,6 +246,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.avg = ff_vvc_avg_12_neon;
         c->inter.w_avg = vvc_w_avg_12;
         c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
+        c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
         c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
 
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 393702373a..d59d278275 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -437,6 +437,67 @@ function ff_vvc_dmvr_h_8_neon, export=1
         ret
 endfunc
 
+.macro vvc_dmvr_h_10 bit_depth
+function ff_vvc_dmvr_h_\bit_depth\()_neon, export=1
+        movrel          x7, X(ff_vvc_inter_luma_dmvr_filters)  // 2-tap horizontal filter on 16-bit samples: out[x] = (c0*src[x] + c1*src[x+1] + rounding) >> (bit_depth - 6); x7 = start of the filter table
+        add             x7, x7, x4, lsl #1                      // step to entry x4; each entry is a pair of byte coefficients (c0, c1)
+        ld2r            {v0.16b, v1.16b}, [x7]                  // v0 = c0, v1 = c1, each replicated across all byte lanes
+        uxtl            v0.8h, v0.8b                            // zero-extend c0 to 16-bit lanes to match the sample width
+        uxtl            v1.8h, v1.8b                            // zero-extend c1 to 16-bit lanes
+        tbz             w6, #4, 12f                             // w6 bit 4 clear -> 12-column loop, otherwise fall into the 20-column loop
+20:                                                             // 20-column loop: one row of 20 outputs per iteration
+        ldur            q3, [x1, #2]                            // src[1..8]: right-hand neighbours of columns 0..7 (samples are 2 bytes apart)
+        ldr             q2, [x1]                                // src[0..7]
+        ldr             q22, [x1, #16]                          // src[8..15]
+        mul             v4.8h, v0.8h, v2.8h                     // c0 * src[0..7]
+        mul             v6.8h, v1.8h, v3.8h                     // c1 * src[1..8]
+        ldur            q23, [x1, #18]                          // src[9..16]: right-hand neighbours of columns 8..15
+        mul             v5.8h, v0.8h, v22.8h                    // c0 * src[8..15]
+        ldur            d17, [x1, #34]                          // src[17..20]: right-hand neighbours of columns 16..19
+        mul             v7.8h, v1.8h, v23.8h                    // c1 * src[9..16]
+        uhadd           v4.8h, v4.8h, v6.8h                     // columns 0..7: (c0*src[x] + c1*src[x+1]) >> 1, halving add so the sum is formed without 16-bit overflow
+        ldr             d16, [x1, #32]                          // src[16..19]
+        uhadd           v5.8h, v5.8h, v7.8h                     // columns 8..15: halved sum of the two products
+        mul             v17.4h, v1.4h, v17.4h                   // c1 * src[17..20]
+        mul             v16.4h, v0.4h, v16.4h                   // c0 * src[16..19]
+        urshr           v4.8h, v4.8h, #(\bit_depth - 6 - 1)     // rounding shift by the remaining bit_depth - 7 bits (one bit was already dropped by uhadd)
+        urshr           v5.8h, v5.8h, #(\bit_depth - 6 - 1)     // same for columns 8..15
+        uhadd           v16.4h, v16.4h, v17.4h                  // columns 16..19: halved sum of the two products
+        urshr           v16.4h, v16.4h, #(\bit_depth - 6 - 1)   // same rounding shift for columns 16..19
+        st1             {v4.8h, v5.8h}, [x0], #32               // store columns 0..15 as 16-bit values, x0 += 32
+        subs            w3, w3, #1                              // one row done; the flags set here are consumed by b.ne below
+        st1             {v16.4h}, [x0], #8                      // store columns 16..19, x0 += 8
+        add             x1, x1, x2                              // advance the source pointer by x2 bytes to the next row
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) // complete the per-row destination step of VVC_MAX_PB_SIZE * 2 bytes
+        b.ne            20b                                     // repeat until w3 reaches zero; the body runs before the test, so w3 must be nonzero on entry
+        ret
+
+12:                                                             // 12-column loop: one row of 12 outputs per iteration
+        ldur            q3, [x1, #2]                            // src[1..8]: right-hand neighbours of columns 0..7
+        ldr             q2, [x1]                                // src[0..7]
+        mul             v4.8h, v0.8h, v2.8h                     // c0 * src[0..7]
+        ldur            d17, [x1, #18]                          // src[9..12]: right-hand neighbours of columns 8..11
+        mul             v6.8h, v1.8h, v3.8h                     // c1 * src[1..8]
+        ldr             d16, [x1, #16]                          // src[8..11]
+        uhadd           v4.8h, v4.8h, v6.8h                     // columns 0..7: (c0*src[x] + c1*src[x+1]) >> 1
+        mul             v17.4h, v1.4h, v17.4h                   // c1 * src[9..12]
+        mul             v16.4h, v0.4h, v16.4h                   // c0 * src[8..11]
+        urshr           v4.8h, v4.8h, #(\bit_depth - 6 - 1)     // rounding shift by the remaining bit_depth - 7 bits
+        uhadd           v16.4h, v16.4h, v17.4h                  // columns 8..11: halved sum of the two products
+        urshr           v16.4h, v16.4h, #(\bit_depth - 6 - 1)   // same rounding shift for columns 8..11
+        st1             {v4.8h}, [x0], #16                      // store columns 0..7 as 16-bit values, x0 += 16
+        subs            w3, w3, #1                              // one row done; the flags set here are consumed by b.ne below
+        st1             {v16.4h}, [x0], #8                      // store columns 8..11, x0 += 8
+        add             x1, x1, x2                              // advance the source pointer by x2 bytes to the next row
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) // complete the per-row destination step of VVC_MAX_PB_SIZE * 2 bytes
+        b.ne            12b                                     // repeat until w3 reaches zero; w3 must be nonzero on entry
+        ret
+endfunc
+.endm
+
+vvc_dmvr_h_10 10
+vvc_dmvr_h_10 12
+
 function ff_vvc_dmvr_hv_8_neon, export=1
         tmp0            .req x7
         tmp1            .req x8

commit 189e841cfdb150cb6614e18454fcf1ed59e17604
Author:     Krzysztof Pyrkosz <[email protected]>
AuthorDate: Fri Sep 5 22:24:55 2025 +0200
Commit:     jianhuaw <[email protected]>
CommitDate: Mon Sep 8 17:51:20 2025 +0000

    avcodec/aarch64/vvc: Implement dmvr_h_8
    
    A78:
    dmvr_h_8_12x20_neon:                                    76.6 ( 4.31x)
    dmvr_h_8_20x12_neon:                                    65.8 ( 3.49x)
    dmvr_h_8_20x20_neon:                                   106.6 ( 3.62x)
    
    A72:
    dmvr_h_8_12x20_neon:                                   190.6 ( 4.40x)
    dmvr_h_8_20x12_neon:                                   171.1 ( 4.31x)
    dmvr_h_8_20x20_neon:                                   275.1 ( 4.50x)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index df0b536539..2e0e7434e8 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -95,6 +95,7 @@ W_AVG_FUN(12)
 
 DMVR_FUN(, 8)
 DMVR_FUN(, 12)
+DMVR_FUN(h_, 8)
 DMVR_FUN(hv_, 8)
 DMVR_FUN(hv_, 10)
 DMVR_FUN(hv_, 12)
@@ -188,6 +189,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.avg = ff_vvc_avg_8_neon;
         c->inter.w_avg = vvc_w_avg_8;
         c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
+        c->inter.dmvr[0][1] = ff_vvc_dmvr_h_8_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
         c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;
 
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index a6648b64fc..393702373a 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -385,6 +385,58 @@ function ff_vvc_dmvr_12_neon, export=1
         ret
 endfunc
 
+function ff_vvc_dmvr_h_8_neon, export=1
+        movrel          x7, X(ff_vvc_inter_luma_dmvr_filters)  // 2-tap horizontal filter on 8-bit samples: out[x] = (c0*src[x] + c1*src[x+1] + 2) >> 2; x7 = start of the filter table
+        add             x7, x7, x4, lsl #1                      // step to entry x4; each entry is a pair of byte coefficients (c0, c1)
+        ld2r            {v0.16b, v1.16b}, [x7]                  // v0 = c0, v1 = c1, each replicated across all byte lanes
+        tbz             w6, #4, 12f                             // w6 bit 4 clear -> 12-column loop, otherwise fall into the 20-column loop
+20:                                                             // 20-column loop: one row of 20 outputs per iteration
+        ldur            q3, [x1, #1]                            // src[1..16]: right-hand neighbours of columns 0..15
+        ldr             q2, [x1]                                // src[0..15]
+        umull           v4.8h, v0.8b, v2.8b                     // c0 * src[0..7], widened to 16 bits
+        umull2          v5.8h, v0.16b, v2.16b                   // c0 * src[8..15]
+        ldur            s17, [x1, #17]                          // src[17..20]: right-hand neighbours of columns 16..19
+        umull           v6.8h, v1.8b, v3.8b                     // c1 * src[1..8]
+        ldr             s16, [x1, #16]                          // src[16..19]
+        umull2          v7.8h, v1.16b, v3.16b                   // c1 * src[9..16]
+        add             v4.8h, v4.8h, v6.8h                     // columns 0..7: c0*src[x] + c1*src[x+1]
+        umull           v17.8h, v1.8b, v17.8b                   // c1 * src[17..20]; only the low four lanes carry loaded data
+        add             v5.8h, v5.8h, v7.8h                     // columns 8..15: sum of the two products
+        umull           v16.8h, v0.8b, v16.8b                   // c0 * src[16..19]; only the low four lanes carry loaded data
+        srshr           v4.8h, v4.8h, #2                        // rounding shift right by 2
+        add             v16.4h, v16.4h, v17.4h                  // columns 16..19: sum of the two products
+        srshr           v5.8h, v5.8h, #2                        // rounding shift right by 2
+        srshr           v16.4h, v16.4h, #2                      // rounding shift right by 2
+        st1             {v4.8h, v5.8h}, [x0], #32               // store columns 0..15 as 16-bit values, x0 += 32
+        subs            w3, w3, #1                              // one row done; the flags set here are consumed by b.ne below
+        st1             {v16.4h}, [x0], #8                      // store columns 16..19, x0 += 8
+        add             x1, x1, x2                              // advance the source pointer by x2 bytes to the next row
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) // complete the per-row destination step of VVC_MAX_PB_SIZE * 2 bytes
+        b.ne            20b                                     // repeat until w3 reaches zero; the body runs before the test, so w3 must be nonzero on entry
+        ret
+
+12:                                                             // 12-column loop: one row of 12 outputs per iteration
+        ldur            d3, [x1, #1]                            // src[1..8]: right-hand neighbours of columns 0..7
+        ldr             d2, [x1]                                // src[0..7]
+        umull           v4.8h, v0.8b, v2.8b                     // c0 * src[0..7], widened to 16 bits
+        ldur            s17, [x1, #9]                           // src[9..12]: right-hand neighbours of columns 8..11
+        umull           v6.8h, v1.8b, v3.8b                     // c1 * src[1..8]
+        ldr             s16, [x1, #8]                           // src[8..11]
+        add             v4.8h, v4.8h, v6.8h                     // columns 0..7: c0*src[x] + c1*src[x+1]
+        umull           v17.8h, v1.8b, v17.8b                   // c1 * src[9..12]; only the low four lanes carry loaded data
+        umull           v16.8h, v0.8b, v16.8b                   // c0 * src[8..11]; only the low four lanes carry loaded data
+        srshr           v4.8h, v4.8h, #2                        // rounding shift right by 2
+        add             v16.4h, v16.4h, v17.4h                  // columns 8..11: sum of the two products
+        srshr           v16.4h, v16.4h, #2                      // rounding shift right by 2
+        st1             {v4.8h}, [x0], #16                      // store columns 0..7 as 16-bit values, x0 += 16
+        subs            w3, w3, #1                              // one row done; the flags set here are consumed by b.ne below
+        st1             {v16.4h}, [x0], #8                      // store columns 8..11, x0 += 8
+        add             x1, x1, x2                              // advance the source pointer by x2 bytes to the next row
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) // complete the per-row destination step of VVC_MAX_PB_SIZE * 2 bytes
+        b.ne            12b                                     // repeat until w3 reaches zero; w3 must be nonzero on entry
+        ret
+endfunc
+
 function ff_vvc_dmvr_hv_8_neon, export=1
         tmp0            .req x7
         tmp1            .req x8

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/aarch64/vvc/dsp_init.c |   6 ++
 libavcodec/aarch64/vvc/inter.S    | 113 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+)


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to