ops_asmgen: fold impl pointer increment into loading of continuation address (branch master)

Ramiro Polla via ffmpeg-cvslog Sat, 06 Jun 2026 04:56:20 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


The following commit(s) were added to refs/heads/master by this push:
     new 025d6330a5 swscale/aarch64/ops_asmgen: fold impl pointer increment 
into loading of continuation address
025d6330a5 is described below

commit 025d6330a5eb62c65a1320df762a26bd1b8960dd
Author:     Ramiro Polla <[email protected]>
AuthorDate: Mon Apr 13 23:28:09 2026 +0200
Commit:     Ramiro Polla <[email protected]>
CommitDate: Sat Jun 6 11:54:14 2026 +0000

    swscale/aarch64/ops_asmgen: fold impl pointer increment into loading of 
continuation address
    
    This commit reduces every kernel by one instruction, for example:
     function ff_sws_clear_8_u16_0001_neon, export=1, jumpable=1
    -        ldr             x0, [x1]                        // SwsFuncPtr cont 
= impl->cont;
             ldr             q16, [x1, #16]                  // v128 clear_vec 
= impl->priv.v128;
    +        ldr             x0, [x1], #32                   // SwsFuncPtr cont 
= (impl++)->cont;
             dup             v0.8h, v16.h[0]                 // vl[0] = 
broadcast(clear_vec[0])
    -        add             x1, x1, #32                     // impl += 1;
             br              x0                              // jump to cont
     endfunc
    
    A55: Overall speedup=1.066x faster, min=0.881x max=1.288x
    A76: Overall speedup=1.012x faster, min=0.570x max=1.546x
    
    The large min/max differences are due to pathological branch miss cases
    that happen either before or after this commit.
    
    Sponsored-by: Sovereign Tech Fund
    Signed-off-by: Ramiro Polla <[email protected]>
---
 libswscale/aarch64/ops_asmgen.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/libswscale/aarch64/ops_asmgen.c b/libswscale/aarch64/ops_asmgen.c
index cca1913630..e88a162de1 100644
--- a/libswscale/aarch64/ops_asmgen.c
+++ b/libswscale/aarch64/ops_asmgen.c
@@ -152,6 +152,7 @@ typedef struct SwsAArch64Context {
     RasmOp op0_func;
     RasmOp op1_impl;
     RasmOp cont;
+    RasmNode *load_cont_node;
 
     /* Vector registers. Two banks (low and high) are used. */
     RasmOp vl[ 4];
@@ -377,6 +378,18 @@ static void asmgen_process_return(SwsAArch64Context *s, 
const SwsAArch64OpImplPa
     i_ret(r);
 }
 
+/*********************************************************************/
+/**
+ * Set node where the continuation address will be loaded and impl will
+ * be incremented. This should be done right after impl->priv has been
+ * used.
+ */
+static void asmgen_set_load_cont_node(SwsAArch64Context *s)
+{
+    RasmContext *r = s->rctx;
+    s->load_cont_node = rasm_get_current_node(r);
+}
+
 /*********************************************************************/
 /* gather raw pixels from planes */
 /* AARCH64_SWS_OP_READ_BIT */
@@ -401,6 +414,7 @@ static void asmgen_op_read_bit(SwsAArch64Context *s, const 
SwsAArch64OpImplParam
      * ushl actually performs a right shift. */
     rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
     i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
+    asmgen_set_load_cont_node(s);
 
     if (p->block_size == 16) {
         i_ldrh(r, wtmp,        a64op_post(s->in[0], 2));    CMT("uint16_t tmp 
= *in[0]++;");
@@ -528,6 +542,7 @@ static void asmgen_op_write_bit(SwsAArch64Context *s, const 
SwsAArch64OpImplPara
 
     rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
     i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
+    asmgen_set_load_cont_node(s);
 
     if (p->block_size == 8) {
         i_ushl(r, vl[0].b8,    vl[0].b8,   shift_vec.b8);   CMT("vl[0] <<= 
shift_vec;");
@@ -896,6 +911,7 @@ static void asmgen_op_clear(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *
      */
 
     i_ldr(r, v_q(clear_vec), a64op_off(s->impl, offsetof_impl_priv));   
CMT("v128 clear_vec = impl->priv.v128;");
+    asmgen_set_load_cont_node(s);
 
     LOOP_MASK      (p, i) { i_dup(r, vl[i], a64op_elem(clear_vec, i));  
CMTF("vl[%u] = broadcast(clear_vec[%u])", i, i); }
     LOOP_MASK_VH(s, p, i) { i_dup(r, vh[i], a64op_elem(clear_vec, i));  
CMTF("vh[%u] = broadcast(clear_vec[%u])", i, i); }
@@ -1024,6 +1040,7 @@ static void asmgen_op_min(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
     RasmOp min_vec = s->vt[4];
 
     i_ldr(r, v_q(min_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 
min_vec = impl->priv.v128;");
+    asmgen_set_load_cont_node(s);
     LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(min_vec, i));      CMTF("v128 
vmin%u = min_vec[%u];", i, i); }
 
     if (p->type == AARCH64_PIXEL_F32) {
@@ -1048,6 +1065,7 @@ static void asmgen_op_max(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
     RasmOp max_vec = s->vt[4];
 
     i_ldr(r, v_q(max_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 
max_vec = impl->priv.v128;");
+    asmgen_set_load_cont_node(s);
     LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(max_vec, i));      CMTF("v128 
vmax%u = max_vec[%u];", i, i); }
 
     if (p->type == AARCH64_PIXEL_F32) {
@@ -1072,6 +1090,7 @@ static void asmgen_op_scale(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *
     RasmOp scale_vec = s->vt[0];
 
     i_add (r, priv_ptr, s->impl, IMM(offsetof_impl_priv));          CMT("v128 
*scale_vec_ptr = &impl->priv;");
+    asmgen_set_load_cont_node(s);
     i_ld1r(r, vv_1(scale_vec), a64op_base(priv_ptr));               CMT("v128 
scale_vec = broadcast(*scale_vec_ptr);");
 
     if (p->type == AARCH64_PIXEL_F32) {
@@ -1195,6 +1214,7 @@ static void asmgen_op_linear(SwsAArch64Context *s, const 
SwsAArch64OpImplParams
     case 4: coeff_veclist = vv_4(vc[0], vc[1], vc[2], vc[3]); break;
     }
     i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv));  CMT("v128 
*vcoeff_ptr = impl->priv.ptr;");
+    asmgen_set_load_cont_node(s);
     i_ld1(r, coeff_veclist, a64op_base(ptr));               CMT("coeff_veclist 
= *vcoeff_ptr;");
 
     /* Compute mask for rows that must be saved before being overwritten. */
@@ -1262,6 +1282,7 @@ static void asmgen_op_dither(SwsAArch64Context *s, const 
SwsAArch64OpImplParams
     }
 
     i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv));  CMT("void *ptr = 
impl->priv.ptr;");
+    asmgen_set_load_cont_node(s);
 
     /**
      * We use ubfiz to mask and shift left in one single instruction:
@@ -1365,7 +1386,7 @@ static void asmgen_op_cps(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
     reshape_all_vectors(s, s->el_count, el_size);
 
     /* Common start for continuation-passing style (CPS) functions. */
-    i_ldr(r, s->cont, a64op_off(s->impl, offsetof_impl_cont));  
CMT("SwsFuncPtr cont = impl->cont;");
+    asmgen_set_load_cont_node(s);
 
     switch (p->op) {
     case AARCH64_SWS_OP_READ_BIT:     asmgen_op_read_bit(s, p);     break;
@@ -1395,8 +1416,13 @@ static void asmgen_op_cps(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
         break;
     }
 
+    /* Load continuation address and increment impl pointer. */
+    RasmNode *node = rasm_set_current_node(r, s->load_cont_node);
+    RasmOp impl_post = a64op_post(s->impl, sizeof_impl);
+    i_ldr(r, s->cont, impl_post);                   CMT("SwsFuncPtr cont = 
(impl++)->cont;");
+    rasm_set_current_node(r, node);
+
     /* Common end for CPS functions. */
-    i_add(r, s->impl, s->impl, IMM(sizeof_impl));   CMT("impl += 1;");
     i_br (r, s->cont);                              CMT("jump to cont");
 }
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] swscale/aarch64/ops_asmgen: fold impl pointer increment into loading of continuation address (branch master)

Reply via email to